This IPython Notebook allows to reproduce the results of the paper Fine-grained Benchmark Subsetting for System Selection accepted for publication at the 2014 International Symposium on Code Generation and Optimization.
The Notebook and the original experimental datasets can be downloaded to replay the data analysis on your own computer.
Our subsetting framework depends on a set of GNU R libraries, that we have to load.
%load_ext rmagic
%%R
require(ggplot2)
require(plyr)
require(scales)
require(grid)
require(reshape)
require(reshape2)
require(permute)
require(GMD)
require(lattice)
require(ggdendro)
require(RGraphics)
require(gridExtra)
We now define some constants and configurations parameters. Ensure that DATA_PATH
points to the directory containing the experimental datasets.
%%R
# Declare directory from where to load the experimental data sets
DATA_PATH <<-"."
# Set to a local directory to produce plots as pdf
PDF_OUTPUT_DIR <- ""
# Declare the architecture names
arch_names <- list(
'sandybridge'="Sandy Bridge",
'core2'="Core 2",
'atom'="Atom"
)
# Architecture clk frequencies (used to convert RDTSC cycles to time)
clks = list("nehalem" = 1860*10^3,
"atom" = 1662*10^3,
"core2" = 2933*10^3,
"sandybridge" = 3330*10^3)
# Declare well-behaved threshold
wellbehaved = 10
# Declare feature set
feature_set = c(
"DP.MFlops.s..DP.assumed.",
"L2.bandwidth..MBytes.s.",
"L3.miss.rate",
"Memory.bandwidth..MBytes.s.",
"Nb.FLOP..div",
"Nb.insn..SD",
"Nb.uops.P1",
"Ratio.ADD.SUB...MUL",
"RecMII..cycles.",
"Vec..ratio......mul..FP.",
"Vec..ratio......other",
"Vec..ratio......other..FP.",
"X.L1..Bytes.stored...cycle",
"X.L1..IPC")
# Declare color palette
colours <- c("#000000","#AE1C3E", "#F5A275")
In this section we implement the clustering algorithm used to subset representatives. We also implement functions required to predict the full benchmark suite from the representative's measures.
First, we need a function to load the experimental datasets.
%%R
# new_session loads our experimental dataset for a given benchmark
# and target architecture
new_session <- function(bench, target) {
ref_file = paste(DATA_PATH, "/", bench, "-reference.csv", sep="")
tar_file = paste(DATA_PATH, "/", bench, "-", target,".csv", sep="")
features_file = paste(DATA_PATH, "/", bench, "-features.csv", sep="")
ref = read.csv(ref_file, header=T)
tar = read.csv(tar_file, header=T)
S = list(data = merge(ref,tar, by=c("BenchName", "CodeletName")),
features = read.csv(features_file, header=T),
tar_clk = clks[[target]],
ref_clk = clks[["nehalem"]])
# Sort features data frame with the same order than the timing data
S$features = S$features[order(match(S$features[,"CodeletName"], S$data[,"CodeletName"])),]
return(S)
}
Features are cleaned and scaled before clustering the codelets.
%%R
#
# scale_features: scales the features, so they have the same weight
# -> as a result each feature as variance 1 and mean 0 across all codelets
#
scale_features <- function(features) {
# remove non numeric columns
features = features[sapply(features, is.numeric)]
# replace NA by zero
features[is.na(features)] <- 0
# scale features
features = scale(features)
# remove all-zero columns
features = features[,!(colSums(abs(features)) == 0)]
return (data.frame(features))
}
Groups of similar codelets are gathered using Ward's hierarchical clustering on the feature vectors. As described in the paper, we modify the clustering to guarantee that every cluster contains at least one well-behaved codelet.
%%R
# Hierarchical clustering
#
# myhiera: does a hierarchical clustering using ward criteria of the codelets
# k is the desired number of clusters
# if k == -1 (the default) the elbow method is used to automatically select
# an optimal number of clusters
#
myhiera <- function(d, features, k=-1) {
fit <- hclust(d, method="ward")
if (k != -1) {
clusters = cutree(fit, k=k)
} else {
# apply elbow method
css.obj <- css.hclust(d, fit, k=nrow(features))
elbow.obj <- elbow.batch(css.obj)
clusters = cutree(fit, k = elbow.obj$k)
}
return (list(clusters=clusters, tree=fit))
}
#
# distancematrix: returns the symmetric matrix of the distances
# between all pairs of codelets
#
distancematrix <- function(features) {
d = dist(features, method="euclidean")
return(d)
}
#
# closest_not_in: used to migrate ill-behaved codelets during representative
# selection. returns the cluster closest to the given codelet that is not
# in the `not_in` set of codelets
#
closest_not_in <- function(CodeletsNames, dist, CodeletName, not_in) {
pos = which(CodeletsNames == CodeletName)
dv = data.frame(CodeletName = CodeletsNames, dists = dist[pos,])
dv = dv[!(dv$CodeletName %in% not_in), ]
dv = dv[order(dv$dists),]
return(dv[1,]$CodeletName)
}
#
# cluster_codelets: returns a hierarchical clustering of codelets that
# guarantees that each cluster contains at least a well-behaved codelet
#
cluster_codelets <- function(S, feature_names, number_of_clusters) {
# Scale the features
features = S$features[feature_names]
features = scale_features(features)
# Compute the distance matrix
distances = distancematrix(features)
D = as.matrix(distances)
# Compute a Ward's hierarchical clustering
hiera = myhiera(distances, features, number_of_clusters)
clusters = data.frame(CodeletName = S$features$CodeletName,
Cluster = hiera[["clusters"]])
# Find clusters without at least a well-behaved codelet
# and migrate them to other clusters
S$data$Cluster = clusters$Cluster
S$data$dists = c(0)
for (cluster in unique(S$data$Cluster)) {
cl = S$data[S$data$Cluster==cluster,]
dists = cl$CPI_mismatch
S$data[S$data$Cluster==cluster,]$dists = dists
if (min(dists) > wellbehaved) {
for (codelet in cl$CodeletName) {
closest = closest_not_in(S$features$CodeletName,
D,
codelet,
cl$CodeletName)
closest_cluster = S$data[S$data$CodeletName == closest,]$Cluster
S$data[S$data$CodeletName == codelet,]$Cluster = closest_cluster
}
}
}
S$data$Cluster = as.numeric(as.factor(S$data$Cluster))
return(S$data)
}
Once clusters are formed, we elect one representative per cluster. Representatives must be well-behaved and close to the cluster centroid.
%%R
# Representative election
#
# clust.centroid: returns the centroid of a cluster
#
clust.centroid = function(i, dat, clusters) {
ind = (clusters == i)
return(colMeans(dat[ind,, drop=FALSE]))
}
#
# elect_representatives: elect a well-behaved codelet as representative in
# each cluster. The codelet is chosen as the closest well-behaved codelet
# to the centroid.
#
elect_representatives <- function(S, feature_names, data) {
features = S$features[feature_names]
features = scale_features(features)
#compute centroid of each cluster
Centroids <- sapply(sort(unique(data$Cluster)), clust.centroid, features, data$Cluster)
features$CodeletName = S$features$CodeletName
data$is.representative = F
for (cl in unique(data$Cluster)) {
feat_cl = features[data$Cluster==cl & data$CPI_mismatch <= wellbehaved,]
feat_cl$CPI_mismatch = data[data$Cluster==cl & data$CPI_mismatch <= wellbehaved,]$CPI_mismatch
if (nrow(feat_cl) == 0) {
feat_cl = features[data$Cluster==cl,]
feat_cl$CPI_mismatch = data[data$Cluster==cl,]$CPI_mismatch
}
#compute for each codelet distance from centroid
feat_cl = ddply(feat_cl, c("CodeletName"), function(x) {
x$CodeletName <- NULL
CPI_mismatch = x$CPI_mismatch
x$CPI_mismatch <- NULL
if (length(x) == 1) {
c = Centroids[cl]
} else {
c = Centroids[, cl]
}
data.frame(DistToCentroid = sum((x - c) ^ 2), CPI_mismatch = CPI_mismatch)
})
feat_cl <- feat_cl[order(feat_cl$CPI_mismatch), ] #order second by matching distance
feat_cl <- feat_cl[order(feat_cl$DistToCentroid), ] #order first by distance to centroid
# we select as representative the closest codelet to the centroid
data[data$Cluster==cl, ]$is.representative =
ifelse(data[data$Cluster==cl, ]$CodeletName %in% feat_cl[1, ]$CodeletName, T, F)
}
return(data)
}
The following functions predict the performance of all the codelets or all the applications given the representative's measures on a target architecture.
%%R
#
# cycle_prediction: predict cycles per invocation for all the codelets
#
cycle_prediction <- function(S, feature_names, data) {
data = elect_representatives(S, feature_names, data)
#Isolate our representatives
representatives <- data[data$is.representative, ]
#Now we need to compute for each cluster the speedup
representatives = within(representatives,
{Speedup =
tar_vitro_CPInv/ref_vivo_CPInv})
# Order representatives per clusters
representatives <- representatives[order(representatives$Cluster), ]
# Merge prediction Speedup, real Speedup and predicted cycles into data
representatives <- representatives[!duplicated(representatives$Cluster), ]
data$Speedup = representatives[data$Cluster, "Speedup"]
data$realSpeedup = (data$tar_vivo_CPInv) / (data$ref_vivo_CPInv)
data$Predicted_cyclesPerIteration = data$ref_vivo_CPInv * data$Speedup
data$per_err = abs(data$tar_vivo_CPInv - data$Predicted_cyclesPerIteration)/
pmax(data$Predicted_cyclesPerIteration, data$tar_vivo_CPInv)
return(data)
}
#
# compute_appli_cycles: predict each application run time by aggregating
# its codelets predictions.
#
compute_appli_cycles <- function(S, data) {
tmp = ddply(data, c("BenchName"), function(x) {
clusterTotal = sum(x$ref_vivo_invocations * x$ref_vivo_CPInv)
data.frame(Speedup =
1/sum((x$ref_vivo_invocations * x$ref_vivo_CPInv)/(clusterTotal*x$Speedup)))
}
)
data <- data[!duplicated(data$BenchName), ]
data <- data[order(data$BenchName), ]
tmp$Reference_APPLI_TIME = data$ref_app_cycles/S$ref_clk
tmp$Target_APPLI_TIME = data$tar_app_cycles/S$tar_clk
tmp$PredictedTime = (data$ref_app_cycles*tmp$Speedup)/S$tar_clk
return(tmp)
}
Finally, we need a set of functions to evaluate the accuracy and benchmark reduction factor of our method.
%%R
#
# compute_benchmaring_cost: computes the reduction achieved by benchmarking the codelets
# instead of the original applications
#
compute_benchmarking_cost <- function(S, data) {
data$BenchmarkTime = data$tar_vitro_CPInv*data$tar_vitro_invocations/S$tar_clk
data = data[data$is.representative,]
return(sum(data$BenchmarkTime))
}
#
# prediction_statistics: computes a set of statistics about the quality and speed of the
# prediction.
#
prediction_statistics <- function(S, data) {
score_stats = data.frame(
mean_per_err = mean(data$per_err),
median_per_err = median(data$per_err),
number_of_clusters = max(data$Cluster),
benchmark_cost_ms = compute_benchmarking_cost(S, data)
)
return(score_stats)
}
#
# gm_mean: returns the geometric mean
#
gm_mean = function(a){prod(a)^(1/length(a))}
#
# compute_global_speedup: computes the speedup for a whole architecture
# by returning the geometric mean of the applications speedups.
#
compute_global_speedup <- function(appli_prediction) {
x=appli_prediction
x$real_speedup = (x$Reference_APPLI_TIME)/(x$Target_APPLI_TIME)
x$predicted_speedup = (x$Reference_APPLI_TIME)/(x$PredictedTime)
appli_benchmark_cost_ms = sum(x$Target_APPLI_TIME)
real_global_speedup = gm_mean(x$real_speedup)
predicted_global_speedup = gm_mean(x$predicted_speedup)
error = abs(real_global_speedup-predicted_global_speedup)/real_global_speedup
stats = data.frame(real_speedup = real_global_speedup,
predicted_speedup = predicted_global_speedup,
appli_benchmark_cost_ms = appli_benchmark_cost_ms,
error = error)
return(stats)
}
# Utility functions:
# Keep only codelets from one single application
keep_only_app <- function(S, app) {
S$data = S$data[S$data$BenchName == app, ]
S$features = S$features[S$features$BenchName == app, ]
return(S)
}
In this section, we produce the Numerical Recipes plots and tables used in the paper. You can reproduce these results by downloading this notebook and running it in your computer.
%%R
# Setup experiment parameters for NR results
number_of_clusters=14
bench="NR"
architecture="atom"
%%R -w 177 -h 89 -u mm -r 100
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "nr-table.pdf", sep="/")
pdf(outputpdf, width=7, height=3.5)
}
# Create a new session with the adequate data
S = new_session(bench, architecture)
# Clean the feature list
features = S$features[feature_set]
features = scale_features(features)
# Compute the distance in the feature space between every pair of codelets
distances = distancematrix(features)
# Compute a hierarchical clustering dendrogram
fit <- hclust(distances, method="ward")
fit = as.dendrogram(fit)
# Run the clustering and prediction method
clusters = cluster_codelets(S, feature_set, number_of_clusters)
prediction = cycle_prediction(S, feature_set, clusters)
prediction$Vec.Ratio.= round(S$features$Vec..ratio......all..FP)
# Read high-level classification data
hlclass = read.csv(paste(DATA_PATH,"hlclassification.csv",sep="/"), header=T, sep=",")
prediction = merge(prediction, hlclass, by="CodeletName")
# Reorder table prediction using the dendrogram codelet order
newprediction = data.frame()
labels <- prediction$CodeletName
for (i in labels[order.dendrogram(fit)]) {
newprediction = rbind(prediction[prediction$Codelet == i,], newprediction)
}
# Reorder cluster numbers by dendrogram
newprediction$Cluster = as.integer(factor(as.character(newprediction$Cluster),
levels=unique(as.character(newprediction$Cluster))))
prediction = newprediction
# Internally times (and therefore accelerations) are relative to machine cycles
# convert to absolute time using the architecture clocks frequencies
freq_ratio = S$ref_clk / S$tar_clk
prediction[,"realSpeedup"] <- 1/(prediction[,"realSpeedup"] * (freq_ratio))
prediction[,"Speedup"] = 1/(prediction[,"Speedup"] * (freq_ratio))
# Put acceleration of representatives between < >
isrepr = prediction[,"is.representative"] == "TRUE"
prediction[!isrepr,"realSpeedup"] = sprintf("%.4s",prediction[!isrepr,"realSpeedup"])
prediction[isrepr,"realSpeedup"] = sprintf("<%.4s>",prediction[isrepr,"realSpeedup"])
# Clean the codelet names suffixes
cleanCodeletNames <- function(D) {
for (suffix in c("_dp_sse", "_bet1_dt0_sse_initbet1", "_sp_sse", "_square_sp_sse", "_mp_sse"))
{
D$CodeletName = factor(sub(suffix, "", D$CodeletName))
}
D$CodeletName = factor(sub("hqr_12_square", "hqr_12_sq", D$CodeletName))
return(D)
}
prediction = cleanCodeletNames(prediction)
# Summarize codelet data by cluster number
D = ddply(prediction, .(Cluster), summarize,
Codelet=CodeletName,
"Computation Pattern"=Algorithm,
Stride=Array.Access..Stride,
Vec.=Vectorization,
"Vec. %" = Vec.Ratio.,
s=realSpeedup)
# Rename column Cluster to C
colnames(D)[1] = "C"
# Clean the D table so identical clusters in successive lines
# are not repeated
cleanf <- function(x){
oldx <- c(FALSE, x[-1]==x[-length(x)])
res <- x
res[oldx] <- ""
res
}
D[c("C")] <- lapply(D[c("C")], cleanf)
# Compute cluster separations row numbers
seps = as.list(which(D$C != "")-1)
# Plot the dendrogram and the table
par(mar=c(0,0,0,0), no.readonly=TRUE)
plot.new()
gl <- grid.layout(nrow=1, ncol=2, widths=unit(c(1,5), 'null'), heights=unit(c(1), 'null'))
vp.1 <- viewport(layout.pos.col=1, layout.pos.row=1)
vp.2 <- viewport(layout.pos.col=2, layout.pos.row=1)
pushViewport(viewport(layout=gl))
pushViewport(vp.1)
# Plot Dendrogram
p = ggdendrogram(fit, rotate=TRUE, labels=F)
p = p + theme(plot.margin = unit(c(0.2,-1.38,-0.94,0), "cm"))
p = p + geom_hline(yintercept=4.5, linetype="dashed", color="darkred")
p = p + annotate("text", size=3, y=14, x=1, color="darkred", label="cut for K = 14")
print(p, newpage=F)
popViewport()
pushViewport(vp.2)
# Plot Table
grid.table(D, gp =gpar(fontsize=6.5, lwd=1), show.rownames=F, padding.v = unit(1.2, "mm"),
padding.h = unit(3.4, "mm"))
# Plot Separators
for (s in seps[-1]) {
ss = nrow(D)-s
y = unit(1,"mm")+unit(ss*2.985,"mm")
grid.segments(0.058, y, 1-0.058, y)
}
popViewport()
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
%%R -w 5 -h 3 -u in -r 150
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "nr-codelets.pdf", sep="/")
pdf(outputpdf, width=3.7, height=3)
}
data_sorted = prediction
#Put angle-brackets around representative names
list_rep = data_sorted[data_sorted[,"is.representative"]=="TRUE","CodeletName"]
levels(data_sorted$CodeletName) <- c(levels(data_sorted$CodeletName),
paste("<", list_rep ,">",sep=""))
data_sorted[data_sorted[,"is.representative"]=="TRUE","CodeletName"] = paste("<",
list_rep ,">",sep="")
#Compute prediction statistics
stats = prediction_statistics(S, data_sorted)
# Choose clusters to plot
data_sorted = data_sorted[data_sorted$Cluster==1 | data_sorted$Cluster==2,]
# Prepare plot labels
data_sorted$info_plot =
paste(
paste("error = ", signif(data_sorted$per_err*100,3), "%", sep = ""),
"\n\n",sep="")
data_sorted[,"Speedup"] = sprintf("%.4s",data_sorted[,"Speedup"])
data_sorted[,"Speedup"] = paste("Cluster ",data_sorted[,"Cluster"], " ",
"(s = ",data_sorted[,"Speedup"],")",sep="")
# Set codelet order in plot
data_sorted$CodeletName = factor(data_sorted$CodeletName,
levels=factor(c("<toeplz_1>","rstrct_29","mprove_8",
"toeplz_4","<realft_4>")))
# Plot
p <- ggplot(data_sorted, aes())
p <- p + geom_point(aes(y=ref_vivo_CPInv/S$ref_clk,
x=CodeletName, col="R", shape="R"),
alpha=1,size=3)
# Plot prediction
iteration = 1
for (element in data_sorted$is.representative) {
colour_info = "black"
transparency = 1
if (element == "TRUE")
{
colour_info = "darkgreen"
transparency = 1
} else {
colour_info = "black"
transparency = 0.75
}
p <- p + geom_text(data=data_sorted[iteration, ],aes(y=ref_vivo_CPInv/S$ref_clk,
x=CodeletName,col="R",
shape="R"),
hjust=0.25, vjust=0.5, size=2.5, alpha=transparency,
colour=colour_info, angle=90)
p <- p + aes_string(label = "info_plot")
iteration = iteration + 1
}
p <- p + geom_point(aes(y=tar_vivo_CPInv/S$tar_clk, x=CodeletName,col="T1",
shape="T1"), alpha=0.8,size=3)
p <- p + geom_point(aes(y=Predicted_cyclesPerIteration/S$tar_clk,x=CodeletName, col="T2",
shape="T2"), alpha=0.8,size=3)
p <- p + geom_segment(aes(y=ref_vivo_CPInv/S$ref_clk,x = CodeletName,
yend=Predicted_cyclesPerIteration/S$tar_clk, xend=CodeletName),
arrow=arrow(length=unit(0.2,"cm")), size=0.3, color="blue", alpha=0.8)
p <- p + scale_y_log10(breaks=c(1,2,5,10,20,40,80,160,320), limits=c(2,160))
p <- p + labs(y = "Execution time (ms / invocation)", x = "")
p <- p + theme(axis.text.x = element_blank())
p <- p + theme(legend.position="top")
p <- p + facet_wrap(~Speedup, scales="free_x")
p <- p + theme_bw(base_size=9)
p <- p + theme(legend.position="top", legend.direction="horizontal")
p <- p + theme(legend.key = element_blank())
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- p + scale_color_manual("", values=colours,
labels=c("Reference (Nehalem)", "Atom real", "Atom predicted"))
p <- p + scale_shape_manual("",
labels=c("Reference (Nehalem)", "Atom real", "Atom predicted"),
values=c(3,4,5))
print(p)
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
%%R
all = data.frame()
for (nclusters in c(14,-1)) {
for (architecture in c("atom","sandybridge")) {
S= new_session(bench, architecture)
clusters = cluster_codelets(S, feature_set, nclusters)
prediction = cycle_prediction(S, feature_set, clusters)
stats = prediction_statistics(S, prediction)
# Create precentage
stats$median_per_err = stats$median_per_err * 100
stats$mean_per_err = stats$mean_per_err * 100
D = data.frame(Architecture = as.factor(as.character(arch_names[architecture])),
Clusters = nclusters,
"Median Error" = signif(stats$median_per_err,2),
"Average Error" = signif(stats$mean_per_err,2))
all = rbind(all, D)
}
}
print(all, row.names=F)
In this section, we produce the NAS Serial plots and tables used in the paper. You can reproduce these results by downloading this notebook and running it in your computer.
%%R
# Setup experiment parameters for NAS SER results
bench="NAS"
architecture="sandybridge"
number_of_clusters=-1
%%R -w 7 -h 3 -u in -r 300
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "nas-codelets.pdf", sep="/")
pdf(outputpdf, width=7, height=3.6)
}
# Load session data
S= new_session(bench, architecture)
# Run the benchmark reduction
clusters = cluster_codelets(S, feature_set, number_of_clusters)
prediction = cycle_prediction(S, feature_set, clusters)
stats = prediction_statistics(S, prediction)
# Sort codelets by running time
data_sorted = arrange(prediction, -ref_vivo_CPInv)
data_sorted$CodeletName = factor(data_sorted$CodeletName, levels=unique(data_sorted$CodeletName))
# Plot data, group by application
p <- ggplot(data_sorted, aes(x=CodeletName, col=ARCH, shape=ARCH))
p <- p + geom_point(aes(y=ref_vivo_CPInv/S$ref_clk,
col="R", shape="R"), size=1.5)
p <- p + geom_point(aes(y=tar_vivo_CPInv/S$tar_clk,
col="T2", shape="T2"), size=1.5)
p <- p + geom_point(aes(y=Predicted_cyclesPerIteration/S$tar_clk,
col="T1", shape="T1"), size=1.5)
p <- p + scale_y_log10(breaks = c(1,2,5,10,25,50,100,200,500,1000,2000,4000))
p <- p + labs(y = "Execution time (ms / invocation)", x=NULL)
p <- p + facet_wrap(~Cluster ~Speedup, scales="free_x")
p <- p + scale_x_discrete(breaks = NA)
p <- p + theme_bw(base_size=9)
p <- p + scale_color_manual("", values=colours, labels=c("Reference (Nehalem)",
"Sandy Bridge real",
"Sandy Bridge predicted"))
p <- p + scale_shape_manual("", labels=c("Reference (Nehalem)",
"Sandy Bridge real",
"Sandy Bridge predicted"),
values=c(3,4,5))
p <- p + theme(legend.position="top") + theme(legend.key = element_blank())
p <- p + facet_wrap(~BenchName, nrow=1, scales="free")
print(p)
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
%%R -w 4 -h 5 -u in -r 150
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "nas-applications.pdf", sep="/")
pdf(outputpdf, width=3.5, height=4)
}
all = data.frame()
# Gather application prediction statistics on the three target architectures
for (arch in c("atom", "core2", "sandybridge")) {
architecture=arch
S= new_session(bench, architecture)
number_of_clusters=-1
clusters = cluster_codelets(S, feature_set, number_of_clusters)
prediction = cycle_prediction(S, feature_set, clusters)
appli = compute_appli_cycles(S, prediction)
appli$Architecture = as.factor(as.character(arch_names[arch]))
all = rbind(all, appli)
}
# Plot the data, group by architecture
all$Diff <- NULL
all$Speedup <- NULL
short.m <- melt(all, id.vars=c("Architecture", "BenchName"))
p <- ggplot(short.m, aes(x=BenchName, y=value/1000, fill=variable))
p <- p + geom_bar(stat="identity", position=position_dodge(), width=0.8)
p <- p + labs(y = "Execution time (s)", x = "")
p <- p + scale_fill_manual(values=colours, labels = c("Reference", "Real", "Predicted"))
p <- p + facet_wrap(~ Architecture, ncol=1,scale="free_y")
p <- p + guides(fill=guide_legend(title=NULL))
p <- p + theme_bw(base_size=9)
p <- p + theme(legend.key= element_blank(), legend.position="top", legend.direction="horizontal")
print(p)
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
%%R -w 3 -h 3 -u in -r 150
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "speedup.pdf", sep="/")
pdf(outputpdf, width=3, height=2.5)
}
# Gather data from previous cell
all2 = data.frame(real_spu = all$Reference_APPLI_TIME/all$Target_APPLI_TIME,
predicted_spu = all$Reference_APPLI_TIME/all$PredictedTime,
BenchName = all$BenchName,
Architecture = all$Architecture)
# Compute geometrical mean by architecture
all2 = ddply(all2, .(Architecture), summarize, real_spu = gm_mean(real_spu),
predicted_spu = gm_mean(predicted_spu))
# Plot the data by architecture
short.m = melt(all2, id.vars=c("Architecture"))
p <- ggplot(short.m, aes(x=Architecture, y=value, fill=variable))
p <- p + geom_bar(stat="identity", position=position_dodge())
p <- p + labs(y="Geometric mean speedup", x = NULL)
p <- p + scale_fill_manual(values=colours[-c(1)], labels = c("Real Speedup", "Predicted Speedup"))
p <- p + ylim(0,2.2)
p <- p + geom_hline(yintercept=1.0, linetype="dashed")
p <- p + geom_text(aes(label=format(round(value,2),2)), position=position_dodge(width=0.9),
vjust=-1, color="black", size=3)
p <- p + guides(fill=guide_legend(title=NULL))
p <- p + theme_bw(base_size=9)
p <- p + theme(legend.key= element_blank(),legend.justification = c(0, 1),
legend.position = c(0, 1))
print(p)
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
%%R -w 7 -h 3 -u in -r 300
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "speedupvserror.pdf", sep="/")
pdf(outputpdf, width=7, height=3)
}
# Aggregate data for the three target architectures
all = data.frame()
for (arch in c("atom", "core2", "sandybridge")) {
architecture=arch
number_of_clusters=-1
S= new_session(bench, architecture)
clusters = cluster_codelets(S, feature_set, number_of_clusters)
prediction = cycle_prediction(S, feature_set, clusters)
stats = prediction_statistics(S, prediction)
P = expand.grid(ncluster=seq(1,35))
D = ddply(P,
.(ncluster),
function(x) { P = cycle_prediction(S, feature_set,
cluster_codelets(S, feature_set, x$ncluster))
cbind(prediction_statistics(S, P),
compute_global_speedup(compute_appli_cycles(S, P)))
}
)
D$benchmarking_speedup = D$appli_benchmark_cost_ms/D$benchmark_cost_ms
D = D[!duplicated(D$number_of_clusters),]
D$ncluster = D$number_of_clusters
D = D[,c("ncluster", "median_per_err", "benchmarking_speedup")]
D = melt(D, id.vars=c("ncluster"))
D$label = c(rep(D[D$variable == "median_per_err" &
D$ncluster==stats$number_of_clusters,]$value,nrow(D)/2),
rep(D[D$variable == "benchmarking_speedup" &
D$ncluster==stats$number_of_clusters,]$value,nrow(D)/2)
)
D$Architecture = as.factor(as.character(arch_names[arch]))
all = rbind(all, D)
}
# Set the plot scales
maxsp = 50
maxer = 25
convert = maxer/maxsp
all[all$variable == "benchmarking_speedup",]$value = all[
all$variable == "benchmarking_speedup",]$value * convert
all[all$variable == "median_per_err",]$value = all[
all$variable == "median_per_err",]$value * 100
all$variable = factor(all$variable, levels=c("median_per_err", "benchmarking_speedup"))
best_c = stats$number_of_clusters
# Plot the speedup & error by increasing number of representatives, group by architecture
trellis.par.set(theme = col.whitebg())
trellis.par.set(fontsize=list(text=10, points=8))
p <- xyplot( value ~ ncluster | Architecture, data = all, groups = variable, type="l",
ylim=c(0, 30), xlim=c(0,25),
between = list(x = 0), scales = list(tck = c(1,0), alternating=1),
xlab="Number of clusters", ylab="Median % error",
par.settings=list(layout.widths=list(right.padding=10)),
auto.key=list(text=c("Median % error","Benchmarking reduction factor"),space="top",
lines=TRUE, points=FALSE, columns=2),
strip=strip.custom(bg="lightgray"),
panel=function(x, y, ...) {
panel.xyplot(x,y, ...)
xscale <- current.viewport()$xscale
if (panel.number() == 3) {
yscale <- current.viewport()$yscale
pushViewport(viewport(width=2, height=2, clip=TRUE))
pushViewport(viewport(width=0.5, height=0.5,
xscale=xscale, yscale=yscale))
at <- pretty(c(0, maxsp))
panel.axis("right", half = FALSE, outside=TRUE,
at = at * convert, labels = at)
panel.text(32, maxer/2, "Benchmarking reduction factor", srt=+90)
popViewport()
popViewport()
}
error = y[best_c]
speed = y[length(y)/2+best_c]/convert
panel.abline(v=best_c, col.line="darkblue", lty=3)
panel.points(best_c, error, pch=1, cex=1.75, col="darkgreen")
panel.points(best_c, speed*convert, pch=1, cex=1.75, col="darkred")
panel.text(best_c-2.5, error-1.5, paste0(signif(error,2), "%"), cex=0.8,
col="darkgreen")
panel.text(best_c-2.5, (speed+2)*convert, paste0("x", signif(speed,2)), cex=0.8,
col="darkred")
})
print(p)
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
%%R -w 4 -h 3 -u in -r 150
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "comp_random_clustering.pdf", sep="/")
pdf(outputpdf, width=3.7, height=3)
}
data = read.csv(paste(DATA_PATH, "random-1000.csv", sep="/"), header=T)
data$cluster = all[all$variable == "median_per_err",]$value/100
g <- ggplot(data, aes(x=n))
g <- g + geom_line(aes(y=median*100,linetype="B"))
g <- g + geom_line(aes(y=minimum*100,linetype="C"))
g <- g + geom_line(aes(y=maximum*100,linetype="A"))
g <- g + geom_ribbon(data=data,aes(ymin=minimum*100,ymax=maximum*100, fill="Range"),
alpha=0.3, fill="gray")
g <- g + geom_line(aes(y=cluster*100, color="GA features"), size = 1.3)
g <- g + labs(x = "Number of clusters", y = "Median % error")
g <- g + xlim(1,24)
#g <- g + scale_y_continuous(labels = percent_format())
g <- g + guides(col=guide_legend(title=NULL))
g <- g + guides(linetype=guide_legend(title=NULL))
g <- g + facet_wrap(~architecture, ncol=1, scales = "free_y")
g <- g + theme_bw(base_size=9)
g <- g + theme(legend.position = "none")
g <- g + theme(legend.key= element_blank(),legend.position = "right",
legend.direction = "vertical")
g <- g + scale_linetype_manual(values=c(1,2,3), labels=c("Worst", "Median", "Best"))
#g <- g + guides(linetype = guide_legend(nrow = 2))
print(g)
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
%%R -w 4 -h 4 -u in -r 150
D = data.frame()
# Limiting our method to extracting representatives only per application"
for (arch in c("atom", "core2", "sandybridge")) {
S = new_session(bench, arch)
prediction = cycle_prediction(S, feature_set, cluster_codelets(S, feature_set, 1))
G = compute_global_speedup(compute_appli_cycles(S, prediction))
print(s)
for (ncluster in seq(1,12)) {
errors = c()
nclusters = 0
time_required = 0
# mg cannot be predicted with codelets from mg, because they are all ill-behaved
# (another disadvantage of being limited to a single application)
# so it's not included.
for (app in c("bt", "cg", "ft", "is", "lu", "sp")) {
S = new_session(bench, arch)
S = keep_only_app(S, app)
# Run the benchmark reduction
clusters = cluster_codelets(S, feature_set, min(ncluster, nrow(S$data)))
prediction = cycle_prediction(S, feature_set, clusters)
errors = c(errors, prediction$per_err)
stats = prediction_statistics(S, prediction)
nclusters = nclusters + max(prediction$Cluster)
time_required = time_required + stats$benchmark_cost_ms
}
D = rbind(D, data.frame(Architecture=as.factor(as.character(arch_names[arch])),
method="Per Application", median_per_err=median(errors)*100,
benchmarking_speedup=G$appli_benchmark_cost_ms/time_required,
ncluster=nclusters))
}
}
if(PDF_OUTPUT_DIR != "") {
outputpdf=paste(PDF_OUTPUT_DIR, "across_vs_single.pdf", sep="/")
pdf(outputpdf, width=3, height=4)
}
E = cast(all[,c("ncluster", "Architecture", "variable", "value")])
E$method = "Across Applications"
print(colnames(E))
print(colnames(D))
data = rbind(E, D)
p = ggplot(data=data, aes(y=median_per_err, x=ncluster, color=method,
linetype=method, shape=method)) + geom_line() + geom_point()
p = p + facet_wrap(~Architecture, ncol=1)
p = p + theme_bw(base_size=9)
p = p + theme(legend.position="top") + theme(legend.key = element_blank())
p <- p + scale_shape_manual("Subsetting", values=c(1,2),
labels=c("Across Applications", "Per Application"))
p <- p + scale_linetype_manual("Subsetting", values=c(1,2),
labels=c("Across Applications", "Per Application"))
p <- p + scale_color_manual("Subsetting", values=colours[1:2],
labels=c("Across Applications", "Per Application"))
p <- p + labs(x = "Number of clusters", y = "Median % error") + xlim(0,25)
print(p)
if(PDF_OUTPUT_DIR != "") { dev.off(); print(paste("Saved to", outputpdf)) }
Codelet clustering is an important part of the method. It enables to keep a single copy among a group of similar codelets. One may wonder if clustering effectively reduces system selection overhead. This is a valid question because the acceleration of our method comes from two factors:
The following experiment examines separately the contributions of these two factors.
%%R
compute_speedup_breakdown <- function(S) {
clusters = cluster_codelets(S, feature_set, number_of_clusters)
prediction = cycle_prediction(S, feature_set, clusters)
stats = prediction_statistics(S, prediction)
speedup = compute_global_speedup(compute_appli_cycles(S, prediction))
without_clustering = sum(S$data$tar_vitro_CPInv*S$data$tar_vitro_invocations/S$tar_clk)
return(c(speedup$appli_benchmark_cost_ms/stats$benchmark_cost_ms,
speedup$appli_benchmark_cost_ms/without_clustering))
}
bench="NAS"
table = data.frame()
for (arch in c("atom", "core2", "sandybridge")) {
architecture=arch
number_of_clusters=-1
S = new_session(bench, architecture)
both = compute_speedup_breakdown(S)
table = rbind(table, data.frame(architecture=arch, with_clusters=both[1],
without_clusters = both[2], ratio=both[1]/both[2]))
}
print(table)
This sections shows how to build the prediction matrix used to transform the representative's measures on a target architecture into the performance estimation of all the codelets.
%%R
# Load session data
S= new_session("NR", "atom")
# Run the benchmark reduction
clusters = cluster_codelets(S, feature_set, number_of_clusters=14)
prediction = cycle_prediction(S, feature_set, clusters)
The matrix \(\textbf{M}\) is defined as:
\[ \mathbf{M_{i,k}} = \left\{ \begin{array}{l l} 0 &\quad \text{ if } p_i \notin C_k\\ \frac{t^{ref}_i}{t^{ref}_{r_k}} &\quad \text{ if } p_i \in C_k \end{array} \right.\ \]
%%R
# Outputs the model matrix
build_pred_matrix <- function(prediction) {
N=nrow(prediction)
K=max(prediction$Cluster)
M=matrix(0,nrow=N, ncol=K)
colnames(M) = seq(max(prediction$Cluster))
rownames(M) = cleanCodeletNames(prediction)$CodeletName
reprs = data.frame()
for (C in seq(max(prediction$Cluster))) {
clus = prediction[prediction$Cluster == C,]
repr = clus[clus$is.representative,]
reprs = rbind(reprs, repr)
t_ref_rk = repr$ref_vivo_CPInv
for (i in seq(nrow(clus))) {
n = as.integer(rownames(clus[i,]))
t_ref_i = clus[i,]$ref_vivo_CPInv
M[n,C] = t_ref_i/t_ref_rk
}
}
return(list(M=M, reprs=reprs))
}
pred = build_pred_matrix(prediction)
pred$M = round(pred$M, 2)
print(as.table(local({pred$M[pred$M == 0] = NA;pred$M})))
We time the 14 representatives on the target architecture and gather the measures in a vector: \(\vec{t^{tar}_{repr}}\)
%%R
t_repr_tar = matrix(pred$reprs$tar_vitro_CPInv)
print(t_repr_tar)
The prediction of all the codelets is computed with the following formula:
\[ \vec{t^{tar}_{all}} = \textbf{M} . \vec{t^{tar}_{repr}} \]
%%R
predicted = pred$M %*% t_repr_tar
print(signif(data.frame(predicted=predicted, real=prediction$tar_vivo_CPInv),2))
List of the original NAS codelets, the file from which they were extracted and the position in the file. The source lines below enclose innermost loop. That is because they are produced during our Static Analysis which focuses on innermost loops. Nevertheless, the codelets that were extracted correspond to the outermost loop enclosing the below source lines.
%%R
# Load session data
S= new_session("NAS", "core2")
#print(colnames(S$features))
print(S$features[,c("CodeletName","BenchName", "Source.file", "Source.lines")])
List of the 18 representatives found with the elbow method (number_of_clusters set to -1)
%%R
clusters = cluster_codelets(S, feature_set, number_of_clusters=-1)
prediction = cycle_prediction(S, feature_set, clusters)
pred = build_pred_matrix(prediction)
print(pred$reprs[, c("CodeletName")])
An archive with the extracted 18 standalone microbenchmarks is available on request.
Prediction matrix:
%%R
print(signif(pred$M))
%%R
#
# This function called on S$data will generate a random clustering of K clusters
#
random_clustering <- function(data, K) {
data$Cluster = rep(1,nrow(data))
# We want exactly K clusters, therefore select randomly K distinct clusters and
# attribute them to clusters 1 to K.
ver = rep(T,nrow(data))
for(i in (1:K))
{
n = which(ver)[round(runif(1,min=1,max=(nrow(data)-i+1)),digits=0)]
data$Cluster[n] <- i
ver[n] <- F
}
# Now label all other codelets with a random cluster between 1 and K
data$Cluster [ver] <- round(runif(nrow(data)-K,min=1,max=K),digits=0)
return (data)
}
Example: generate a random clustering for NR codelets.
%%R
S = new_session("NR", "atom")
cluster = random_clustering(S$data,10)
print(cluster[,c("CodeletName", "Cluster")])
%%R
require(genalg)
require(permute)
bench = "NR"
number_of_clusters = -1 # we use elbow during GA exploration
features = scan(file=paste(DATA_PATH, "features.list", sep="/"), what="character")
S = list("atom" = new_session(bench, "atom"),
"sandybridge" = new_session(bench, "sandybridge"))
eval_arch <- function(clusters, arch, features) {
prediction = cycle_prediction(S[[arch]], features, clusters)
stats = prediction_statistics(S[[arch]], prediction)
return(stats$mean_per_err)
}
eval_func <- function(c) {
# The empty chromosome has an error of 100 percent
if (sum(c) <= 3) {
return(100)
}
select_features = features[as.logical(c)]
clusters = cluster_codelets(S[["sandybridge"]], select_features, number_of_clusters)
sandybridge = eval_arch(clusters, "sandybridge", select_features)
atom_clusters = S[["atom"]]$data
atom_clusters$Cluster = clusters$Cluster
atom = eval_arch(atom_clusters, "atom")
return (max(atom, sandybridge)*max(atom_clusters$Cluster))
}
monitor <- function(obj) {
minEval = min(obj$evaluations)
filter = obj$evaluations == minEval
bestObjectCount = sum(rep(1, obj$popSize)[filter])
# ok, deal with the situation that more than one object is best
if (bestObjectCount > 1) {
bestSolution = obj$population[filter,][1,]
} else {
bestSolution = obj$population[filter,]
}
# plot the population
print(obj$best)
print(obj$mean)
cat(summary.rbga(obj))
print(features[as.logical(bestSolution)])
}
The real GA exploration used in the paper had 1000 individuals and 100 generations. It took more than 12 hours to run and was not run through IPython Notebook. Here we demonstrate with a toy population. Feel free to set the parameters to those of the paper to reproduce the original experiment.
%%R
PopulationSize = 10 # The real value during our experiments in 1000,
# the exploration took more than 12 hours.
Generations = 5 # The real value used was 100
GAmodel <- rbga.bin(size = length(features), popSize = PopulationSize,
iters = Generations, mutationChance = 0.01,
elitism = T, evalFunc = eval_func, monitorFunc = monitor)
plot(GAmodel)
save(GAmodel, file="gamodel.data")
The following cell is used to style this notebook and can be safely ignored.
# Use custom css style for this Notebook
from IPython.core.display import HTML
def css_styling():
styles = open("custom.css", "r").read()
return HTML(styles)
css_styling()