perflab实验报告55931.docx
- 文档编号:15892819
- 上传时间:2023-07-08
- 格式:DOCX
- 页数:15
- 大小:113.46KB
perflab实验报告55931.docx
《perflab实验报告55931.docx》由会员分享,可在线阅读,更多相关《perflab实验报告55931.docx(15页珍藏版)》请在冰点文库上搜索。
perflab实验报告55931
课程实验报告
课程名称:
计算机组成与结构
实验项目名称:
perflab-handout
专业班级:
姓名:
学号:
指导教师:
杨科华
完成时间:
2016年5月27日
信息科学与工程学院
实验题目:
perflab程序性能调优
实验目的:
理解编译器,学习程序优化,从优化程序代码和程序执行速度两方面着手。
实验要求:
本次实验,要求针对每个函数、每个人均至少写出3种优化版本、并根据driver报告的结果进行性能分析
实验环境:
ubuntu-15.10、x32系统、VMwareworkstation
实验内容及操作步骤:
将下载下来的kernels.c中的rotate、smooth函数进行优化。
rotate函数的作用是将图像逆时针旋转90°,smooth函数的作用是对于图像中的每一个像素点,取它和周围的像素点的平均值,让图片变得模糊。
下面对代码进行逐一优化。
源代码的CPE测试:
1.Naive_rotate
1)源代码:
charnaive_rotate_descr[]="naive_rotate:
Naivebaselineimplementation";
voidnaive_rotate(intdim,pixel*src,pixel*dst)
{
inti,j;
for(i=0;i for(j=0;j dst[RIDX(dim-1-j,i,dim)]=src[RIDX(i,j,dim)]; } 2)分析: 这段代码的作用就是用一个双层循环将所有的像素进行行列调位、导致整幅图画进行了90度旋转。 然而分析一下代码就能发现一个十分简单的优化方法: 因为在最内层循环中,j的值每次都会改变,所以每执行一次赋值就要计算一次dim-1-j,算多了自然就慢了。 我们可以利用简单的数学技巧改写公式,将赋值语句改成dst[RIDX(i,j,dim)]=src[RIDX(j,dim-i-1,dim)];这样就不用每次都计算了。 3)优化代码1如下: charnaive_rotate_descr2[]="naive_rotate2: onlychangetheplaceofiandj"; voidnaive_rotate2(intdim,pixel*src,pixel*dst) { inti,j; for(i=0;i for(j=0;j dst[RIDX(i,j,dim)]=src[RIDX(j,dim-i-1,dim)];//ichangeless } 优化结果如下: 这是一种最为简单的优化方案,由图可知,速度提升不大,性能优化结果也不是很好。 再分析源代码,从cache友好性来分析,这个代码的效率机会很低,所以按照cache的大小,应在存储的时候进行32个像素依次存储(列存储)。 做到cache友好这样就可以可以大幅度提高效率。 4)优化代码2如下: charrotate_descr2[]="rotate2: version2 break into 4*4 blocks"; voidrotate2(intdim,pixel*src,pixel*dst) { inti,j,ii,jj; for(ii=0;ii for(jj=0;jj for(i=ii;i for(j=jj;j dst[RIDX(dim-1-j,i,dim)]=src[RIDX(i,j,dim)]; } 优化结果如下: 用分块的方式,进行优化。 将整个程序分成4*4的小块,提高空间局部性 5)优化代码3如下: charrotate_descr3[]="rotate3: version3breakinto32*32blocks"; voidrotate3(intdim,pixel*src,pixel*dst) { inti,j,ii,jj; for(ii=0;ii for(jj=0;jj for(i=ii;i for(j=jj;j dst[RIDX(dim-1-j,i,dim)]=src[RIDX(i,j,dim)]; } 优化结果如下: 分成32*32块,提高空间局部性 6)优化代码4如下: charrotate_descr4[]="rotate4: Currentworkingversion,usingpointerratherthancomputingaddress"; voidrotate4(intdim,pixel*src,pixel*dst) { inti; intj; inttmp1=dim*dim; inttmp2=dim*31; inttmp3=tmp1-dim; inttmp4=tmp1+32; inttmp5=dim+31; dst+=tmp3; for(i=0;i { for(j=0;j { *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; dst++;src+=dim; *dst=*src; src++; src-=tmp2; dst-=tmp5; } src+=tmp2; dst+=tmp4; } } 优化结果如下: 用循环展开,分成32路并行来写 2.Naive_smooth 1)源代码 charnaive_smooth_descr[]="naive_smooth: Naivebaselineimplementation"; voidnaive_smooth(intdim,pixel*src,pixel*dst) { inti,j; for(i=0;i for(j=0;j dst[RIDX(i,j,dim)]=avg(dim,i,j,src); } CPES性能如下: 2)分析 这段代码很多次地调用avg函数,而avg函数内也频繁调用initialize_pixel_sum、accumulate_sum、assign_sum_to_pixel这几个函数,且含有2层for循环。 虽然会以损害程序的模块性为代价,但消除函数调用的时间开销,得到的代码运行速度会快得多。 所以,需要改写代码,不调用avg函数。 3)优化代码1如下: charsmooth_descr1[]="smooth1: withlessfunccallandgrosslysimplifiedcalculationforcentralparts"; voidsmooth1(intdim,pixel*src,pixel*dst) { inti,j,ii,jj; pixel_sumsum; pixelcurrent_pixel,cp; for(j=0;j { dst[RIDX(0,j,dim)]=avg(dim,0,j,src); dst[RIDX(dim-1,j,dim)]=avg(dim,dim-1,j,src); } for(i=0;i { dst[RIDX(i,0,dim)]=avg(dim,i,0,src); dst[RIDX(i,dim-1,dim)]=avg(dim,i,dim-1,src); } for(i=1;i for(j=1;j { sum.red=sum.green=sum.blue=0; for(ii=max(i-1,0);ii<=min(i+1,dim-1);ii++) for(jj=max(j-1,0);jj<=min(j+1,dim-1);jj++) { cp=src[RIDX(ii,jj,dim)]; sum.red+=cp.red; sum.green+=cp.green; sum.blue+=cp.blue; } current_pixel.red=sum.red/9; current_pixel.green=sum.green/9; current_pixel.blue=sum.blue/9; dst[RIDX(i,j,dim)]=current_pixel; } } 优化结果如下: 4)优化代码2如下: charsmooth_descr2[]="smooth2: testversion"; voidsmooth2(intdim,pixel*src,pixel*dst) { inti,j; //nousingavg() //cornersdst[RIDX(0,0,dim)].red=(src[RIDX(0,0,dim)].red+src[RIDX(1,0,dim)].red+src[RIDX(0,1,dim)].red+src[RIDX(1,1,dim)].red)>>2; dst[RIDX(0,0,dim)].blue=(src[RIDX(0,0,dim)].blue+src[RIDX(1,0,dim)].blue+src[RIDX(0,1,dim)].blue+src[RIDX(1,1,dim)].blue)>>2; dst[RIDX(0,0,dim)].green=(src[RIDX(0,0,dim)].green+src[RIDX(1,0,dim)].green+src[RIDX(0,1,dim)].green+src[RIDX(1,1,dim)].green)>>2; dst[RIDX(0,dim-1,dim)].red=(src[RIDX(0,dim-1,dim)].red+src[RIDX(1,dim-1,dim)].red+src[RIDX(0,dim-2,dim)].red+src[RIDX(1,dim-2,dim)].red)>>2; dst[RIDX(0,dim-1,dim)].blue=(src[RIDX(0,dim-1,dim)].blue+src[RIDX(1,dim-1,dim)].blue+src[RIDX(0,dim-2,dim)].blue+src[RIDX(1,dim-2,dim)].blue)>>2; dst[RIDX(0,dim-1,dim)].green=(src[RIDX(0,dim-1,dim)].green+src[RIDX(1,dim-1,dim)].green+src[RIDX(0,dim-2,dim)].green+src[RIDX(1,dim-2,dim)].green)>>2; dst[RIDX(dim-1,0,dim)].red=(src[RIDX(dim-1,0,dim)].red+src[RIDX(dim-2,0,dim)].red+src[RIDX(dim-1,1,dim)].red+src[RIDX(dim-2,1,dim)].red)>>2; dst[RIDX(dim-1,0,dim)].blue=(src[RIDX(dim-1,0,dim)].blue+src[RIDX(dim-2,0,dim)].blue+src[RIDX(dim-1,1,dim)].blue+src[RIDX(dim-2,1,dim)].blue)>>2;dst[RIDX(dim-1,0,dim)].green=(src[RIDX(dim-1,0,dim)].green+src[RIDX(dim-2,0,dim)].green+src[RIDX(dim-1,1,dim)].green+src[RIDX(dim-2,1,dim)].green)>>2; dst[RIDX(dim-1,dim-1,dim)].red=(src[RIDX(dim-1,dim-1,dim)].red+src[RIDX(dim-1,dim-2,dim)].red+src[RIDX(dim-2,dim-1,dim)].red+src[RIDX(dim-2,dim-2,dim)].red)>>2; dst[RIDX(dim-1,dim-1,dim)].blue=(src[RIDX(dim-1,dim-1,dim)].blue+src[RIDX(dim-1,dim-2,dim)].blue+src[RIDX(dim-2,dim-1,dim)].blue+src[RIDX(dim-2,dim-2,dim)].blue)>>2; dst[RIDX(dim-1,dim-1,dim)].green=(src[RIDX(dim-1,dim-1,dim)].green+src[RIDX(dim-1,dim-2,dim)].green+src[RIDX(dim-2,dim-1,dim)].green+src[RIDX(dim-2,dim-2,dim)].green)>>2; //boarder for(i=1;i { dst[RIDX(i,0,dim)].red=(src[RIDX(i,0,dim)].red+src[RIDX(i-1,0,dim)].red+src[RIDX(i-1,1,dim)].red+src[RIDX(i,1,dim)].red+src[RIDX(i+1,0,dim)].red+src[RIDX(i+1,1,dim)].red)/6; dst[RIDX(i,0,dim)].blue=(src[RIDX(i,0,dim)].blue+src[RIDX(i-1,0,dim)].blue+src[RIDX(i-1,1,dim)].blue+src[RIDX(i,1,dim)].blue+src[RIDX(i+1,0,dim)].blue+src[RIDX(i+1,1,dim)].blue)/6; dst[RIDX(i,0,dim)].green=(src[RIDX(i,0,dim)].green+src[RIDX(i-1,0,dim)].green+src[RIDX(i-1,1,dim)].green+src[RIDX(i,1,dim)].green+src[RIDX(i+1,0,dim)].green+src[RIDX(i+1,1,dim)].green)/6; } for(i=1;i { dst[RIDX(i,dim-1,dim)].red=(src[RIDX(i,dim-1,dim)].red+src[RIDX(i-1,dim-1,dim)].red+src[RIDX(i-1,dim-2,dim)].red+src[RIDX(i,dim-2,dim)].red+src[RIDX(i+1,dim-1,dim)].red+src[RIDX(i+1,dim-2,dim)].red)/6; dst[RIDX(i,dim-1,dim)].blue=(src[RIDX(i,dim-1,dim)].blue+src[RIDX(i-1,dim-1,dim)].blue+src[RIDX(i-1,dim-2,dim)].blue+src[RIDX(i,dim-2,dim)].blue+src[RIDX(i+1,dim-1,dim)].blue+src[RIDX(i+1,dim-2,dim)].blue)/6; dst[RIDX(i,dim-1,dim)].green=(src[RIDX(i,dim-1,dim)].green+src[RIDX(i-1,dim-1,dim)].green+src[RIDX(i-1,dim-2,dim)].green+src[RIDX(i,dim-2,dim)].green+src[RIDX(i+1,dim-1,dim)].green+src[RIDX(i+1,dim-2,dim)].green)/6; } for(j=1;j { dst[RIDX(0,j,dim)].red=(src[RIDX(0,j,dim)].red+src[RIDX(0,j-1,dim)].red+src[RIDX(1,j-1,dim)].red+src[RIDX(1,j,dim)].red+src[RIDX(0,j+1,dim)].red+src[RIDX(1,j+1,dim)].red)/6; dst[RIDX(0,j,dim)].blue=(src[RIDX(0,j,dim)].blue+src[RIDX(0,j-1,dim)].blue+src[RIDX(1,j-1,dim)].blue+src[RIDX(1,j,dim)].blue+src[RIDX(0,j+1,dim)].blue+src[RIDX(1,j+1,dim)].blue)/6; dst[RIDX(0,j,dim)].green=(src[RIDX(0,j,dim)].green+src[RIDX(0,j-1,dim)].green+src[RIDX(1,j-1,dim)].green+src[RIDX(1,j,dim)].green+src[RIDX(0,j+1,dim)].green+src[RIDX(1,j+1,dim)].green)/6; } for(j=1;j { dst[RIDX(dim-1,j,dim)].red=(src[RIDX(dim-1,j,dim)].red+src[RIDX(dim-1,j+1,dim)].red+src[RIDX(dim-1,j-1,dim)].red+src[RIDX(dim-2,j,dim)].red+src[RIDX(dim-2,j+1,dim)].red+src[RIDX(dim-2,j-1,dim)].red)/6; dst[RIDX(dim-1,j,dim)].blue=(src[RIDX(dim-1,j,dim)].blue+src[RIDX(dim-1,j+1,dim)].blue+src[RIDX(dim-1,j-1,dim)].blue+src[RIDX(dim-2,j,dim)].blue+src[RIDX(dim-2,j+1,dim)].blue+src[RIDX(dim-2,j-1,dim)].blue)/6; dst[RIDX(dim-1,j,dim)].green=(src[RIDX(dim-1,j,dim)].green+src[RIDX(dim-1,j+1,dim)].green+src[RIDX(dim-1,j-1,dim)].green+src[RIDX(dim-2,j,dim)].green+src[RIDX(dim-2,j+1,dim)].green+src[RIDX(dim-2,j-1,dim)].green)/6; } //common for(i=1;i for(j=1;j dst[RIDX(i,j,dim)].red=(src[RIDX(i,j,dim)].red+src[RIDX(i+1,j,dim)].red+src[RIDX(i-1,j,dim)].red+src[RIDX(i,j-1,dim)].red+src[RIDX(i+1,j-1,dim)].red+src[RIDX(i-1,j-1,dim)].red+src[RIDX(i,j+1,dim)].red+src[RIDX(i+1,j+1,dim)].red+src[RIDX(i-1,j+1,dim)].red)/9; dst[RIDX(i,j,dim)].blue=(src[RIDX(i,j,dim)].blue+src[RIDX(i+1,j,dim)].blue+src[RIDX(i-1,j,dim)].blue+src[RIDX(i,j-1,dim)].blue+src[RIDX(i+1,j-1,dim)].blue+src[RIDX(i-1,j-1,dim)].blue+s
- 配套讲稿:
如PPT文件的首页显示word图标,表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
- 特殊限制:
部分文档作品中含有的国旗、国徽等图片,仅作为作品整体效果示例展示,禁止商用。设计者仅对作品中独创性部分享有著作权。
- 关 键 词:
- perflab 实验 报告 55931