讨论后自己写了一个腐蚀膨胀,开操作:

一、最开始的C++版本windows

int myopen(Mat bwsrc, Mat &dstimg2, int kernelwidth = 2)
{int comparerows = bwsrc.rows;int comparecols = bwsrc.cols;//do erode...Mat dstimg(bwsrc.size(), CV_8UC1, Scalar(0));for (int i = 0; i < comparerows-1; i ++){uchar *currentRow = bwsrc.ptr<uchar>(i);uchar *nextRow = bwsrc.ptr<uchar>(i + 1);for (int j = 0; j < comparecols-1; j ++){int up0 = currentRow[j];int down0 = nextRow[j];int   up1 = currentRow[j + 1];int   down1 = nextRow[j + 1];if (up0 == 255 && up1 == 255 && down0 == 255 && down1 == 255){dstimg.ptr<uchar>(i)[j] = 255;}else{dstimg.ptr<uchar>(i)[j] = 0;}}}//do dilate...//Mat dstimg2(bwsrc.size(), CV_8UC1, Scalar(0));for (int i = 0; i < comparerows-1; i ++){uchar *currentRow = dstimg.ptr<uchar>(i);uchar *nextRow = dstimg.ptr<uchar>(i + 1);for (int j = 0; j < comparecols-1; j ++){int up0 = currentRow[j];int down0 = nextRow[j];int  up1 = currentRow[j + 1];int   down1 = nextRow[j + 1];if (up0 == 0 && up1 == 0 && down0 == 0 && down1 == 0){dstimg2.ptr<uchar>(i)[j] = 0;}else{dstimg2.ptr<uchar>(i)[j] = 255;}}}//count non-zero points...int non_zero_num = 0;for (int i = 0; i < comparerows; i++){uchar *currentrow = dstimg2.ptr<uchar>(i);for (int j = 0; j < comparecols; j++){int value = currentrow[j];if (value != 0){non_zero_num++;}}}return non_zero_num;
}

结果与opencv的一致。

二、OpenCL版本

1,本来我是这样规划的:

但这样写到一半时卡住了,因为始终无法解决“访存合并”的问题。。。

2,后来改了一种写法可以规避“访存合并”的问题:

a,昨晚就写好了,但运行时遇到个问题 简而言之就是:

这样写是OK的  会打印出problem?这个检测语句;但如果改成:

这样写就不行  NDRange会返回 -30!!!

详细来讲就是:这个开操作 我本来是这样写的:

__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows)
{   int mydstImg[1936*1456]={0};int dstimg[1936*1456]={0};int dstimg2[1936*1456]={0};int secondTempSum[1456]={0};for(uint i=get_global_id(1);i<myimgrows;i+=get_global_size(1)){for(uint j=get_global_id(0);j<myimgcols;j+=get_global_size(0)){int rowstart=i*myimgcols*3;int tempb=currentImg[rowstart+j*3];int tempg=currentImg[rowstart+j*3+1];int tempr=currentImg[rowstart+j*3+2];int rgbpixels=tempr+tempg*256+tempb*256*256;uchar rgbelement=csvArray[rgbpixels];if((int)rgbelement>0){mydstImg[i*myimgcols+j]=255;}}}barrier(CLK_LOCAL_MEM_FENCE);//do erode...//__global uchar *dstimg (default is Scalar(0))for(uint currentGroupID=get_global_id(1);currentGroupID<myimgrows-1;currentGroupID+=get_global_size(1)){uint rowposition=currentGroupID*myimgcols+get_global_id(0);for(;rowposition<myimgcols-1;rowposition+=get_global_size(0)){__private int erodetempValues[4];for(int t=0;t<2;t++){erodetempValues[t]=mydstImg[rowposition+t*myimgcols];}for(int t=2;t<4;t++){erodetempValues[t]=mydstImg[rowposition+(t-2)*myimgcols+1];}if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255){dstimg[rowposition]=1;//if need the img ,it is should be changed to 255.}}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);//do dilate...for(int currentGroupID=get_group_id(1);currentGroupID<myimgrows-1;currentGroupID+=get_num_groups(1)){int rowposition=currentGroupID*myimgcols+get_global_id(0);for(;rowposition<myimgcols-1;rowposition+=get_global_size(0)){__private int dilatetempValues[4];for(int t=0;t<2;t++){dilatetempValues[t]=dstimg[rowposition+t*myimgcols];}for(int t=2;t<4;t++){dilatetempValues[t]=dstimg[rowposition+(t-2)*myimgcols+1];}if(dilatetempValues[0]==1 || dilatetempValues[1]==1 || dilatetempValues[2]==1 || dilatetempValues[3]==1){dstimg2[rowposition]=1;}}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);//if need the fluore-img ,you need return dstimg2 here.//do count the fluore of the image after open operation...//__global secondTempSum[myimgrows];for(uint i=get_global_id(1);i<myimgrows;i+=get_global_size(1)){for(uint j=myimgcols/2;j>0;j/=2){if(get_global_id(0)<j){dstimg2[i*myimgcols+get_global_id(0)]+=dstimg2[i*myimgcols+get_global_id(0)+j];}barrier(CLK_LOCAL_MEM_FENCE);}if(get_global_id(0)==0){secondTempSum[i]=dstimg2[i*myimgcols];}barrier(CLK_LOCAL_MEM_FENCE);}//the last Sum:secondTempSum[0]...if(get_global_id(1)==0){for(uint j=myimgrows/2;j>0;j/=2){if(get_global_id(0)<j){secondTempSum[get_global_id(0)]+=secondTempSum[get_global_id(0)+j];}barrier(CLK_LOCAL_MEM_FENCE);}//if(get_global_id(0)==0)//{//    fluorecountResult=secondTempSum[0];//}}if(get_global_id(0)==0 && get_global_id(1)==0){printf("fluore points of the  image: %d \n",secondTempSum[0]);}barrier(CLK_LOCAL_MEM_FENCE);}

但在腐蚀中这句就会报 -30的错:

if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255){dstimg[rowposition]=1;}

百思不得其解?

3,大神说我的腐蚀部分对于一个group需要的只是两行的数据而已,故可以更简单更快:

按照他的建议,我写成了:

//size_t localsize[2]={1024,1};
//  size_t globalsize[2]={1024,1024};
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,__global int *tempSum)
{   __local int mydstImg[1936*3]; __local int erodeImg[1936*2];     //={0}; //local variable can not be inited like this....__local int dilateImg[1936];     //={0};//__global int tempSum[1455];     //exclusive the last row...int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);for(;currentGroupID<myimgrows-1;currentGroupID+=get_num_groups(0)*get_num_groups(1)){//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0)){int tempb=currentImg[currentGroupID*myimgcols+i*3];int tempg=currentImg[currentGroupID*myimgcols+i*3+1];int tempr=currentImg[currentGroupID*myimgcols+i*3+2];int rgbpixels=tempr+tempg*256+tempb*256*256;uchar rgbelement=csvArray[rgbpixels];if((int)rgbelement>0){mydstImg[i]=255;}else{mydstImg[i]=0;}}barrier(CLK_LOCAL_MEM_FENCE);//step2:erode...for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0)){if((j==myimgcols-1) || (j==myimgcols*2-1)){erodeImg[j]=0;break;      //??????????????????????}__private int erodetempValues[4]={0}; //private array can be inited like this??for(int t=0;t<2;t++){erodetempValues[t]=mydstImg[j+t*myimgcols];}for(int t=2;t<4;t++){erodetempValues[t]=mydstImg[j+(t-2)*myimgcols+1];}if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255){erodeImg[j]=1;}else{erodeImg[j]=0;}}barrier(CLK_LOCAL_MEM_FENCE);//step3:dilate...for(uint j=get_global_id(0);j<myimgcols;j+=get_local_size(0)){if(j==myimgcols-1){dilateImg[j]=0;break;      //??????????????????????}__private int dilatetempValues[4]={0};for(int t=0;t<2;t++){dilatetempValues[t]=erodeImg[j+t*myimgcols];}for(int t=2;t<4;t++){dilatetempValues[t]=erodeImg[j+(t-2)*myimgcols+1];}if(dilatetempValues[0]==1 || dilatetempValues[1]==1 || dilatetempValues[2]==1 || dilatetempValues[3]==1){dilateImg[j]=1;}else{dilateImg[j]=0;}}barrier(CLK_LOCAL_MEM_FENCE);//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...for(uint stride=myimgcols/2;stride>0;stride/=2){barrier(CLK_LOCAL_MEM_FENCE);if(get_local_id(0)<stride){dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];}}if(get_local_id(0)==0){tempSum[currentGroupID]=dilateImg[0];}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE); //other groups wait for the working groups...//not CLK_LOCAL_MEM_FENCE???? //step5:count the last fluore sum...if(get_global_id(1)==0){for(uint j=(myimgrows-1)/2;j>0;j/=2){barrier(CLK_LOCAL_MEM_FENCE);if(get_local_id(0)<j){tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];}}if(get_local_id(0)==0){printf("fluore points of the  image: %d \n",tempSum[0]+tempSum[1454]);}barrier(CLK_LOCAL_MEM_FENCE);}}

但这个结果是无意义的数。。。我好像知道是哪里错了  修改后:

//size_t localsize[2]={1024,1};
//  size_t globalsize[2]={1024,1024};
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,__global int *tempSum)
{   __local int mydstImg[1936*3]; __local int erodeImg[1936*2];     //={0}; //local variable can not be inited like this....__local int dilateImg[1936];     //={0};//__global int tempSum[1454];     //exclusive the last row...int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1)){//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0)){int tempb=currentImg[currentGroupID*myimgcols+i*3];int tempg=currentImg[currentGroupID*myimgcols+i*3+1];int tempr=currentImg[currentGroupID*myimgcols+i*3+2];int rgbpixels=tempr+tempg*256+tempb*256*256;uchar rgbelement=csvArray[rgbpixels];if((int)rgbelement>0){mydstImg[i]=255;}else{mydstImg[i]=0;}}barrier(CLK_LOCAL_MEM_FENCE);//step2:erode...for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0)){if((j==myimgcols-1) || (j==myimgcols*2-1)){erodeImg[j]=0;break;      //??????????????????????}__private int erodetempValues[4]={0}; //private array can be inited like this??for(int t=0;t<2;t++){erodetempValues[t]=mydstImg[j+t*myimgcols];}for(int t=2;t<4;t++){erodetempValues[t]=mydstImg[j+(t-2)*myimgcols+1];}if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255){erodeImg[j]=255;}else{erodeImg[j]=0;}}barrier(CLK_LOCAL_MEM_FENCE);//step3:dilate...for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0)){if(j==myimgcols-1){dilateImg[j]=0;break;      //??????????????????????}__private int dilatetempValues[4]={0};for(int t=0;t<2;t++){dilatetempValues[t]=erodeImg[j+t*myimgcols];}for(int t=2;t<4;t++){dilatetempValues[t]=erodeImg[j+(t-2)*myimgcols+1];}if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255){dilateImg[j]=1;}else{dilateImg[j]=0;}}barrier(CLK_LOCAL_MEM_FENCE);//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...for(uint stride=myimgcols/2;stride>0;stride/=2){barrier(CLK_LOCAL_MEM_FENCE);if(get_local_id(0)<stride){dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];}}if(get_local_id(0)==0){tempSum[currentGroupID]=dilateImg[0];}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE); //other groups wait for the working groups...//not CLK_LOCAL_MEM_FENCE???? //step5:count the last fluore sum...tempSum[0]if(get_global_id(1)==0){for(uint j=1454/2;j>0;j/=2){if(get_local_id(0)<j){tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];}barrier(CLK_LOCAL_MEM_FENCE);}//if(get_local_id(0)==0)//{//   printf("fluore-points-last: %d \n",tempSum[0]);//}//barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);}

main.cpp:

int main()
{char filename[100];cl_uint platformNum;cl_int status;status=clGetPlatformIDs(0,NULL,&platformNum);if(status!=CL_SUCCESS){printf("cannot get platforms number.\n");return -1;}cl_platform_id* platforms;platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);status=clGetPlatformIDs(platformNum,platforms,NULL);if(status!=CL_SUCCESS){printf("cannot get platforms addresses.\n");return -1;}cl_platform_id platformInUse=platforms[0];cl_device_id device;clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/open_god.cl");std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));const char * src = srcProg.c_str();size_t length = srcProg.length();cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);status=clBuildProgram(program,1,&device,NULL,NULL,NULL);if (status != CL_SUCCESS){shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);oclLogBuildInfo(program, oclGetFirstDev(context));oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");return(EXIT_FAILURE);}//get the csv model from the disk.const int rgbsize=256*256*256;uchar* rgbarray = new uchar[rgbsize];memset(rgbarray, 0, rgbsize * sizeof(uchar));Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);Mat csvimg2=ygdata->getSamples();int csvrows=csvimg2.rows; //csv points//cout<<"primer csvrows:"<<csvrows<<endl;for (int j = 0; j < csvrows; j++){float* pixeldata = csvimg2.ptr<float>(j);float x = pixeldata[0];float y = pixeldata[1];float z = pixeldata[2];int newindex = x + y * 256 + z * 256 * 256;rgbarray[newindex] = 255;}TickMeter tm;tm.start();//get the src images from the disk.int imgwidth,imgheight;int ii=817;sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);Mat srcimg=imread(filename);imgheight=srcimg.rows;imgwidth=srcimg.cols;int pixels=imgheight*imgwidth;int srcdatasize=pixels*3*sizeof(uchar);cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);cl_mem  srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);size_t sumsize=1454*sizeof(int);cl_mem  sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int),  &imgwidth);status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int),  &imgheight);status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem),  (void*)&sumArray_buffer);size_t localsize[2]={1024,1};size_t globalsize[2]={1024,1024};status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);if (status != CL_SUCCESS){cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;return(EXIT_FAILURE);}status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clFinish() failed..."<<status<<endl;return(EXIT_FAILURE);}int *sumMap=(int*)malloc(sumsize);status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;return(EXIT_FAILURE);}cout<<"fluore result:"<<sumMap[0]<<endl;tm.stop();cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseProgram(program);clReleaseKernel(kernel_imgProc);clReleaseMemObject(srcdata_buffer);clReleaseMemObject(rgbArray_buffer);delete [] rgbarray;free(sumMap);return 0;
}

但结果有点不稳定,出现过一次6,另外再试N次又都是稳定的1311????可能是我cl文件中疑惑的那处造成的?!先不管结果正不正确,比opencv的open()和在host端计算点数之和减速了1ms!!!

但这个不稳定就证明还有问题 我想想。。。
于是我准备将cl部分腐蚀和膨胀的图片传出来看看:

//size_t localsize[2]={1024,1};
//  size_t globalsize[2]={1024,1024};
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,__global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{   __local int mydstImg[1936*3]; __local int erodeImg[1936*2];     //={0}; //local variable can not be inited like this....__local int dilateImg[1936];     //={0};//__global int tempSum[1454];     //exclusive the last row...int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1)){//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0)){int tempb=currentImg[currentGroupID*myimgcols+i*3];int tempg=currentImg[currentGroupID*myimgcols+i*3+1];int tempr=currentImg[currentGroupID*myimgcols+i*3+2];int rgbpixels=tempr+tempg*256+tempb*256*256;uchar rgbelement=csvArray[rgbpixels];if((int)rgbelement>0){mydstImg[i]=255;}else{mydstImg[i]=0;}}barrier(CLK_LOCAL_MEM_FENCE);//step2:erode...for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0)){if((j==myimgcols-1) || (j==myimgcols*2-1)){erodeImg[j]=0;break;      //????????}__private int erodetempValues[4]={0}; //private array can be inited like this??for(int t=0;t<2;t++){erodetempValues[t]=mydstImg[j+t*myimgcols];}for(int t=2;t<4;t++){erodetempValues[t]=mydstImg[j+(t-2)*myimgcols+1];}if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255){erodeImg[j]=255;}else{erodeImg[j]=0;}if(get_local_id(0)<myimgcols){testErodeImg[currentGroupID*myimgcols+j]=erodeImg[j];}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);//step3:dilate...for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0)){if(j==myimgcols-1){dilateImg[j]=0;break;      //??????????????????????}__private int dilatetempValues[4]={0};for(int t=0;t<2;t++){dilatetempValues[t]=erodeImg[j+t*myimgcols];}for(int t=2;t<4;t++){dilatetempValues[t]=erodeImg[j+(t-2)*myimgcols+1];}if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255){dilateImg[j]=255;}else{dilateImg[j]=0;}testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];}barrier(CLK_LOCAL_MEM_FENCE);//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...for(uint stride=myimgcols/2;stride>0;stride/=2){barrier(CLK_LOCAL_MEM_FENCE);if(get_local_id(0)<stride){dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];}}if(get_local_id(0)==0){tempSum[currentGroupID]=dilateImg[0];}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE); //other groups wait for the working groups...//not CLK_LOCAL_MEM_FENCE???? //step5:count the last fluore sum...tempSum[0]if(get_global_id(1)==0){for(uint j=1454/2;j>0;j/=2){if(get_local_id(0)<j){tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];}barrier(CLK_LOCAL_MEM_FENCE);}//if(get_local_id(0)==0)//{//    printf("fluore-points-last: %d \n",tempSum[0]);//}//barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);}

main.cpp:

int main()
{char filename[100];cl_uint platformNum;cl_int status;status=clGetPlatformIDs(0,NULL,&platformNum);if(status!=CL_SUCCESS){printf("cannot get platforms number.\n");return -1;}cl_platform_id* platforms;platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);status=clGetPlatformIDs(platformNum,platforms,NULL);if(status!=CL_SUCCESS){printf("cannot get platforms addresses.\n");return -1;}cl_platform_id platformInUse=platforms[0];cl_device_id device;clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/open_god.cl");std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));const char * src = srcProg.c_str();size_t length = srcProg.length();cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);status=clBuildProgram(program,1,&device,NULL,NULL,NULL);if (status != CL_SUCCESS){shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);oclLogBuildInfo(program, oclGetFirstDev(context));oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");return(EXIT_FAILURE);}//get the csv model from the disk.const int rgbsize=256*256*256;uchar* rgbarray = new uchar[rgbsize];memset(rgbarray, 0, rgbsize * sizeof(uchar));Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);Mat csvimg2=ygdata->getSamples();int csvrows=csvimg2.rows; //csv points//cout<<"primer csvrows:"<<csvrows<<endl;for (int j = 0; j < csvrows; j++){float* pixeldata = csvimg2.ptr<float>(j);float x = pixeldata[0];float y = pixeldata[1];float z = pixeldata[2];int newindex = x + y * 256 + z * 256 * 256;rgbarray[newindex] = 255;}TickMeter tm;tm.start();//get the src images from the disk.int imgwidth,imgheight;int ii=817;sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);Mat srcimg=imread(filename);imgheight=srcimg.rows;imgwidth=srcimg.cols;int pixels=imgheight*imgwidth;int srcdatasize=pixels*3*sizeof(uchar);cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);cl_mem  srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);size_t sumsize=1454*sizeof(int);cl_mem  sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);int zero = 0;status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int),  &imgwidth);status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int),  &imgheight);status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem),  (void*)&sumArray_buffer);status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem),  (void*)&erodeImg4test_buffer);status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem),  (void*)&dilateImg4test_buffer);size_t localsize[2]={1024,1};size_t globalsize[2]={1024,1024};status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);if (status != CL_SUCCESS){cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;return(EXIT_FAILURE);}status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clFinish() failed..."<<status<<endl;return(EXIT_FAILURE);}int *sumMap=(int*)malloc(sumsize);status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;return(EXIT_FAILURE);}cout<<"fluore result:"<<sumMap[0]<<endl;uchar *hostErode=NULL;hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);uchar *hostDilate=NULL;hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);tm.stop();cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseProgram(program);clReleaseKernel(kernel_imgProc);clReleaseMemObject(srcdata_buffer);clReleaseMemObject(rgbArray_buffer);delete [] rgbarray;free(sumMap);clReleaseMemObject(erodeImg4test_buffer);clReleaseMemObject(dilateImg4test_buffer);return 0;
}

但结果是:

但原图和opencv开之和的图是这样子的:

我将我的中间结果腐蚀和膨胀的图片 放大看,是看到有膨胀效果的,证明没有编写错,根据腐蚀的结果图基础上进行膨胀。那么就是腐蚀那里错了?

结果一步步找,发现最开始根据那个csv文件得到每个像素点是0还是255生成原始待处理图片那里就错了:上周我是这样写的:

上周这样写没错,返回的mydstImg这个图片是正确的;但今天改成了下面这样,返回的就是错的了:

我屏蔽了后面的腐蚀膨胀和计数。但这里返回的图片是错的,与上周对比,为什么?

另外,大神跟我将了bit map/bit mask

位图是有效的降低图像处理中的掩盖层使用量的方法。  原子置位我之前没听过,等搞完这个问题我去查下,看有没例子。这个技巧可以直接从int类型,降低到1/32 (~ 3%)的存储器使用量的!!!!

4,终于知道了问题所在,拿三行的那里应该*3通道的,我眼瞎:

修改后:将腐蚀和膨胀的图返回并与OpenCV的结果图对比一致,肉眼上一致 !!还没具体算点数:

__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,__global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{   __local uchar mydstImg[1936*3]; __local uchar erodeImg[1936*2];     //={0}; //local variable can not be inited like this....__local int dilateImg[1936];     //={0};int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1)){//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0)){int tempb=currentImg[currentGroupID*myimgcols*3+i*3];int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];int rgbpixels=tempr+tempg*256+tempb*256*256;uchar rgbelement=csvArray[rgbpixels];if((int)rgbelement>0){mydstImg[i]=255;}else{mydstImg[i]=0;}}//step2:erode...for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0)){if((j==myimgcols-1) || (j==myimgcols*2-1)){erodeImg[j]=0;break;      }__private int erodetempValues[4]={0}; //private array can be inited like this??for(int t=0;t<2;t++){erodetempValues[t]=(int)mydstImg[j+t*myimgcols];}for(int t=2;t<4;t++){erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];}if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255){erodeImg[j]=255;}else{erodeImg[j]=0;}if(get_local_id(0)<myimgcols){testErodeImg[currentGroupID*myimgcols+j]=mydstImg[j];}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);//step3:dilate...for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0)){if(j==myimgcols-1){dilateImg[j]=0;break;      }__private int dilatetempValues[4]={0};for(int t=0;t<2;t++){dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];}for(int t=2;t<4;t++){dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];}if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255){dilateImg[j]=255;}else{dilateImg[j]=0;}testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);
}

main.cpp:

int main()
{char filename[100];cl_uint platformNum;cl_int status;status=clGetPlatformIDs(0,NULL,&platformNum);if(status!=CL_SUCCESS){printf("cannot get platforms number.\n");return -1;}cl_platform_id* platforms;platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);status=clGetPlatformIDs(platformNum,platforms,NULL);if(status!=CL_SUCCESS){printf("cannot get platforms addresses.\n");return -1;}cl_platform_id platformInUse=platforms[0];cl_device_id device;clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/open_test.cl");std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));const char * src = srcProg.c_str();size_t length = srcProg.length();cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);status=clBuildProgram(program,1,&device,NULL,NULL,NULL);if (status != CL_SUCCESS){shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);oclLogBuildInfo(program, oclGetFirstDev(context));oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");return(EXIT_FAILURE);}//get the csv model from the disk.const int rgbsize=256*256*256;uchar* rgbarray = new uchar[rgbsize];memset(rgbarray, 0, rgbsize * sizeof(uchar));Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);Mat csvimg2=ygdata->getSamples();int csvrows=csvimg2.rows; //csv points//cout<<"primer csvrows:"<<csvrows<<endl;for (int j = 0; j < csvrows; j++){float* pixeldata = csvimg2.ptr<float>(j);float x = pixeldata[0];float y = pixeldata[1];float z = pixeldata[2];int newindex = x + y * 256 + z * 256 * 256;rgbarray[newindex] = 255;}TickMeter tm;tm.start();//get the src images from the disk.int imgwidth,imgheight;int ii=817;sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);Mat srcimg=imread(filename);imgheight=srcimg.rows;imgwidth=srcimg.cols;int pixels=imgheight*imgwidth;int srcdatasize=pixels*3*sizeof(uchar);cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);cl_mem  srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);size_t sumsize=1454*sizeof(int);cl_mem  sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);int zero = 0;status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int),  &imgwidth);status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int),  &imgheight);status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem),  (void*)&sumArray_buffer);status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem),  (void*)&erodeImg4test_buffer);status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem),  (void*)&dilateImg4test_buffer);size_t localsize[2]={1024,1};size_t globalsize[2]={1024,1024};status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);if (status != CL_SUCCESS){cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;return(EXIT_FAILURE);}status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clFinish() failed..."<<status<<endl;return(EXIT_FAILURE);}//int *sumMap=(int*)malloc(sumsize);//status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);//status=clFinish(queue);//if (status != CL_SUCCESS)// {//   cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;//    return(EXIT_FAILURE);// }//cout<<"fluore result:"<<sumMap[0]<<endl;uchar *hostErode=NULL;hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);uchar *hostDilate=NULL;hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);tm.stop();cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseProgram(program);clReleaseKernel(kernel_imgProc);clReleaseMemObject(srcdata_buffer);clReleaseMemObject(rgbArray_buffer);delete [] rgbarray;//free(sumMap);clReleaseMemObject(erodeImg4test_buffer);clReleaseMemObject(dilateImg4test_buffer);return 0;
}

这个返回的先腐蚀后膨胀后的图基本与opencv下一致,我待会儿算一下点数,就知道是否真的一致了!刚返回host端算点是正确的,但我想在求和算点在kernel上算,因为快些。但是在kernel端的结果竟然是错的:

//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...//1: correct...int rowSum=0;if(get_local_id(0)==0){for(uint t=0;t<myimgcols;t++){rowSum+=dilateImg[t];}}barrier(CLK_LOCAL_MEM_FENCE);//2: false...why?????for(uint stride=myimgcols/2;stride>0;stride/=2){barrier(CLK_LOCAL_MEM_FENCE);if(get_local_id(0)<stride){dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];}}if(get_local_id(0)==0){printf("false: %d  correct:%d\n",dilateImg[0],rowSum);}barrier(CLK_LOCAL_MEM_FENCE);

这两种写法,为什么第二种的结果和第一种会不一样呢?

位置是正确的,但数据怎么计算得不一样呢???????
终于知道原因了 ,因为myimgcols=1936,不是2的整数次幂,故规约法求和时漏掉了4个数:

把这4个加上就好了!!!

5,初步正确的OpenCL开操作版本:

__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,  __global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{     __local uchar mydstImg[1936*3];   __local uchar erodeImg[1936*2];     //={0}; //local variable can not be inited like this....  __local int dilateImg[1936];     //={0};  int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);  for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1))  {  //step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...  for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))  {  int tempb=currentImg[currentGroupID*myimgcols*3+i*3];  int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];  int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];  int rgbpixels=tempr+tempg*256+tempb*256*256;  uchar rgbelement=csvArray[rgbpixels];  if((int)rgbelement>0)  {  mydstImg[i]=255;  }  else  {  mydstImg[i]=0;  }  }  //step2:erode...  for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))  {  if((j==myimgcols-1) || (j==myimgcols*2-1))  {  erodeImg[j]=0;  break;        }  __private int erodetempValues[4]={0}; //private array can be inited like this??  for(int t=0;t<2;t++)  {  erodetempValues[t]=(int)mydstImg[j+t*myimgcols];  }  for(int t=2;t<4;t++)  {  erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];  }  if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)  {  erodeImg[j]=255;  }  else  {  erodeImg[j]=0;  }  if(get_local_id(0)<myimgcols)  {  testErodeImg[currentGroupID*myimgcols+j]=mydstImg[j];  }  barrier(CLK_LOCAL_MEM_FENCE);  }  barrier(CLK_LOCAL_MEM_FENCE);  //step3:dilate...  for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))  {  if(j==myimgcols-1)  {  dilateImg[j]=0;  break;        }  __private int dilatetempValues[4]={0};  for(int t=0;t<2;t++)  {  dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];  }  for(int t=2;t<4;t++)  {  dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];  }  if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)  {  dilateImg[j]=1;  }  else  {  dilateImg[j]=0;  }  testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];  }  barrier(CLK_LOCAL_MEM_FENCE);  //step4:count the every fluore sum of every row (in charge of every group)of the dilate image...  for(uint stride=myimgcols/2;stride>0;stride/=2)  {  barrier(CLK_LOCAL_MEM_FENCE);  if(get_local_id(0)<stride)  {  dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];  }  }  if(get_local_id(0)==0)  {  tempSum[currentGroupID]=dilateImg[0]+dilateImg[2]+dilateImg[6]+dilateImg[14]+dilateImg[120];//printf("false: %d  correct:%d\n",tempSum[currentGroupID],rowSum);  }  barrier(CLK_LOCAL_MEM_FENCE);  }  //barrier(CLK_LOCAL_MEM_FENCE);  //CLK_LOCAL_MEM_FENCE can not help here!!!!!!result in the unstable...//step5:count the last fluore sum...tempSum[0]if(get_global_id(1)==0){for(uint j=1454/2;j>0;j/=2){barrier(CLK_LOCAL_MEM_FENCE);if(get_local_id(0)<j){tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];}}if(get_local_id(0)==0){printf("kernel-points-last: %d\n",tempSum[0]+tempSum[4]+tempSum[10]+tempSum[44]+tempSum[180]+tempSum[362]+tempSum[726]);}barrier(CLK_LOCAL_MEM_FENCE);}barrier(CLK_LOCAL_MEM_FENCE);}  

main.cpp:

int main()
{char filename[100];cl_uint platformNum;cl_int status;status=clGetPlatformIDs(0,NULL,&platformNum);if(status!=CL_SUCCESS){printf("cannot get platforms number.\n");return -1;}cl_platform_id* platforms;platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);status=clGetPlatformIDs(platformNum,platforms,NULL);if(status!=CL_SUCCESS){printf("cannot get platforms addresses.\n");return -1;}cl_platform_id platformInUse=platforms[0];cl_device_id device;clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/blog.cl");std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));const char * src = srcProg.c_str();size_t length = srcProg.length();cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);status=clBuildProgram(program,1,&device,NULL,NULL,NULL);if (status != CL_SUCCESS){shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);oclLogBuildInfo(program, oclGetFirstDev(context));oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");return(EXIT_FAILURE);}//get the csv model from the disk.const int rgbsize=256*256*256;uchar* rgbarray = new uchar[rgbsize];memset(rgbarray, 0, rgbsize * sizeof(uchar));Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);Mat csvimg2=ygdata->getSamples();int csvrows=csvimg2.rows; //csv points//cout<<"primer csvrows:"<<csvrows<<endl;for (int j = 0; j < csvrows; j++){float* pixeldata = csvimg2.ptr<float>(j);float x = pixeldata[0];float y = pixeldata[1];float z = pixeldata[2];int newindex = x + y * 256 + z * 256 * 256;rgbarray[newindex] = 255;}TickMeter tm;tm.start();//get the src images from the disk.int imgwidth,imgheight;int ii=817;sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);Mat srcimg=imread(filename);imgheight=srcimg.rows;imgwidth=srcimg.cols;int pixels=imgheight*imgwidth;int srcdatasize=pixels*3*sizeof(uchar);cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);cl_mem  srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);size_t sumsize=1454*sizeof(int);cl_mem  sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);int zero = 0;status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int),  &imgwidth);status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int),  &imgheight);status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem),  (void*)&sumArray_buffer);status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem),  (void*)&erodeImg4test_buffer);status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem),  (void*)&dilateImg4test_buffer);size_t localsize[2]={1024,1};size_t globalsize[2]={1024,1024};status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);if (status != CL_SUCCESS){cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;return(EXIT_FAILURE);}status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clFinish() failed..."<<status<<endl;return(EXIT_FAILURE);}//int *sumMap=(int*)malloc(sumsize);//status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);//status=clFinish(queue);//if (status != CL_SUCCESS)// {//   cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;//   return(EXIT_FAILURE);// }//cout<<"fluore result:"<<sumMap[0]<<endl;uchar *hostErode=NULL;hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);uchar *hostDilate=NULL;hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);tm.stop();cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseProgram(program);clReleaseKernel(kernel_imgProc);clReleaseMemObject(srcdata_buffer);clReleaseMemObject(rgbArray_buffer);delete [] rgbarray;//free(sumMap);clReleaseMemObject(erodeImg4test_buffer);clReleaseMemObject(dilateImg4test_buffer);return 0;
}

结果是4139个点!与opencv的开结果一致!

但有个偶然性:什么都没改  第一次和后面很多次的结果不一样。

可能是同步的问题
因为有个地方我没处理好
我想等所有groups运行完  再将这些groups的结果加起来
但这没办法等待的!!!!!!所以造成了第一次的结果是乱七八糟的数,后面再次运行都正确!

6,稳定的OpenCL版本

我将无法同步的最后的计数部分返回到host端计数,这样就相当于一种同步了,测试了每次的结果都正确。

__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,  __global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{     __local uchar mydstImg[1936*3];   __local uchar erodeImg[1936*2];     //={0}; //local variable can not be inited like this....  __local int dilateImg[1936];     //={0};  int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);  for(;currentGroupID<myimgrows-2;currentGroupID+=get_num_groups(0)*get_num_groups(1))  {  //step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...  for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))  {  int tempb=currentImg[currentGroupID*myimgcols*3+i*3];  int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];  int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];  int rgbpixels=tempr+tempg*256+tempb*256*256;  uchar rgbelement=csvArray[rgbpixels];  if((int)rgbelement>0)  {  mydstImg[i]=255;  }  else  {  mydstImg[i]=0;  }  }  //step2:erode...  for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))  {  if((j==myimgcols-1) || (j==myimgcols*2-1))  {  erodeImg[j]=0;  break;        }  __private int erodetempValues[4]={0}; //private array can be inited like this??  for(int t=0;t<2;t++)  {  erodetempValues[t]=(int)mydstImg[j+t*myimgcols];  }  for(int t=2;t<4;t++)  {  erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];  }  if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)  {  erodeImg[j]=255;  }  else  {  erodeImg[j]=0;  }  if(get_local_id(0)<myimgcols)  {  testErodeImg[currentGroupID*myimgcols+j]=mydstImg[j];  }  barrier(CLK_LOCAL_MEM_FENCE);  }  barrier(CLK_LOCAL_MEM_FENCE);  //step3:dilate...  for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))  {  if(j==myimgcols-1)  {  dilateImg[j]=0;  break;        }  __private int dilatetempValues[4]={0};  for(int t=0;t<2;t++)  {  dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];  }  for(int t=2;t<4;t++)  {  dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];  }  if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)  {  dilateImg[j]=1;  }  else  {  dilateImg[j]=0;  }  testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];  }  barrier(CLK_LOCAL_MEM_FENCE);  //step4:count the every fluore sum of every row (in charge of every group)of the dilate image...  for(uint stride=myimgcols/2;stride>0;stride/=2)  {  barrier(CLK_LOCAL_MEM_FENCE);  if(get_local_id(0)<stride)  {  dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];  }  }  if(get_local_id(0)==0)  {  tempSum[currentGroupID]=dilateImg[0]+dilateImg[2]+dilateImg[6]+dilateImg[14]+dilateImg[120];//printf("correct: %d  ID:%d\n",tempSum[currentGroupID],currentGroupID);  }  barrier(CLK_LOCAL_MEM_FENCE);  }  //Above all is correct...}  

main.cpp:

int main()
{char filename[100];cl_uint platformNum;cl_int status;status=clGetPlatformIDs(0,NULL,&platformNum);if(status!=CL_SUCCESS){printf("cannot get platforms number.\n");return -1;}cl_platform_id* platforms;platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);status=clGetPlatformIDs(platformNum,platforms,NULL);if(status!=CL_SUCCESS){printf("cannot get platforms addresses.\n");return -1;}cl_platform_id platformInUse=platforms[0];cl_device_id device;clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/blog.cl");std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));const char * src = srcProg.c_str();size_t length = srcProg.length();cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);status=clBuildProgram(program,1,&device,NULL,NULL,NULL);if (status != CL_SUCCESS){shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);oclLogBuildInfo(program, oclGetFirstDev(context));oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");return(EXIT_FAILURE);}//get the csv model from the disk.const int rgbsize=256*256*256;uchar* rgbarray = new uchar[rgbsize];memset(rgbarray, 0, rgbsize * sizeof(uchar));Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);Mat csvimg2=ygdata->getSamples();int csvrows=csvimg2.rows; //csv points//cout<<"primer csvrows:"<<csvrows<<endl;for (int j = 0; j < csvrows; j++){float* pixeldata = csvimg2.ptr<float>(j);float x = pixeldata[0];float y = pixeldata[1];float z = pixeldata[2];int newindex = x + y * 256 + z * 256 * 256;rgbarray[newindex] = 255;}TickMeter tm;tm.start();//get the src images from the disk.int imgwidth,imgheight;int ii=817;sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);Mat srcimg=imread(filename);imgheight=srcimg.rows;imgwidth=srcimg.cols;int pixels=imgheight*imgwidth;int srcdatasize=pixels*3*sizeof(uchar);cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);cl_mem  srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);size_t sumsize=1454*sizeof(int);cl_mem  sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY| CL_MEM_ALLOC_HOST_PTR, sumsize, NULL,&status);cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);int zero = 0;status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int),  &imgwidth);status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int),  &imgheight);status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem),  (void*)&sumArray_buffer);status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem),  (void*)&erodeImg4test_buffer);status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem),  (void*)&dilateImg4test_buffer);size_t localsize[2]={1024,1};size_t globalsize[2]={1024,1024};status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);if (status != CL_SUCCESS){cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;return(EXIT_FAILURE);}status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clFinish() failed..."<<status<<endl;return(EXIT_FAILURE);}int *sumMap=NULL;sumMap=(int*)clEnqueueMapBuffer(queue,sumArray_buffer,CL_TRUE, CL_MAP_READ, 0, sumsize, 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, sumArray_buffer, (void*)sumMap, 0, NULL, NULL);int finalSum=0;for(int j=0;j<1454;j++){finalSum+=sumMap[j];//cout<<"ID:  "<<j<<"--Value: "<<sumMap[j]<<endl;}cout<<"host fluore result:"<<finalSum<<endl;uchar *hostErode=NULL;hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);uchar *hostDilate=NULL;hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);tm.stop();cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseProgram(program);clReleaseKernel(kernel_imgProc);clReleaseMemObject(srcdata_buffer);clReleaseMemObject(rgbArray_buffer);delete [] rgbarray;//free(sumMap);clReleaseMemObject(erodeImg4test_buffer);clReleaseMemObject(dilateImg4test_buffer);return 0;
}

当然这里方便计数我在kernel中将膨胀后的结果255改成了1。反正不影响计数,如果要看膨胀后的图片,用255就好了。

7,最终的OpenCL-Open()版本

__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows, __global int *tempSum)
{     __local uchar mydstImg[1936*3];   __local uchar erodeImg[1936*2];     //={0}; //local variable can not be inited like this....  __local int dilateImg[1936];     //={0};  int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);  for(;currentGroupID<myimgrows-2;currentGroupID+=get_num_groups(0)*get_num_groups(1))  {  //step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...  for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))  {  int tempb=currentImg[currentGroupID*myimgcols*3+i*3];  int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];  int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];  int rgbpixels=tempr+tempg*256+tempb*256*256;  uchar rgbelement=csvArray[rgbpixels];  if((int)rgbelement>0)  {  mydstImg[i]=255;  }  else  {  mydstImg[i]=0;  }  }  barrier(CLK_LOCAL_MEM_FENCE);  //step2:erode...  for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))  {  if((j==myimgcols-1) || (j==myimgcols*2-1))  {  erodeImg[j]=0;  break;        }  __private int erodetempValues[4]={0}; //private array can be inited like this??  for(int t=0;t<2;t++)  {  erodetempValues[t]=(int)mydstImg[j+t*myimgcols];  }  for(int t=2;t<4;t++)  {  erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];  }  if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)  {  erodeImg[j]=255;  }  else  {  erodeImg[j]=0;  }  }  barrier(CLK_LOCAL_MEM_FENCE);  //step3:dilate...  for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))  {  if(j==myimgcols-1)  {  dilateImg[j]=0;  break;        }  __private int dilatetempValues[4]={0};  for(int t=0;t<2;t++)  {  dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];  }  for(int t=2;t<4;t++)  {  dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];  }  if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)  {  dilateImg[j]=1;  }  else  {  dilateImg[j]=0;  }  }  barrier(CLK_LOCAL_MEM_FENCE);  //step4:count the every fluore sum of every row (in charge of every group)of the dilate image...  for(uint stride=myimgcols/2;stride>0;stride/=2)  {  barrier(CLK_LOCAL_MEM_FENCE);  if(get_local_id(0)<stride)  {  dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];  }  }  if(get_local_id(0)==0)  {  tempSum[currentGroupID]=dilateImg[0]+dilateImg[2]+dilateImg[6]+dilateImg[14]+dilateImg[120];//printf("correct: %d  ID:%d\n",tempSum[currentGroupID],currentGroupID);  }  barrier(CLK_LOCAL_MEM_FENCE);  }  //Above all is correct...
}  

main.cpp部分:

int main()
{char filename[100];cl_uint platformNum;cl_int status;status=clGetPlatformIDs(0,NULL,&platformNum);if(status!=CL_SUCCESS){printf("cannot get platforms number.\n");return -1;}cl_platform_id* platforms;platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);status=clGetPlatformIDs(platformNum,platforms,NULL);if(status!=CL_SUCCESS){printf("cannot get platforms addresses.\n");return -1;}cl_platform_id platformInUse=platforms[0];cl_device_id device;clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/blog.cl");std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));const char * src = srcProg.c_str();size_t length = srcProg.length();cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);status=clBuildProgram(program,1,&device,NULL,NULL,NULL);if (status != CL_SUCCESS){shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);oclLogBuildInfo(program, oclGetFirstDev(context));oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");return(EXIT_FAILURE);}//get the csv model from the disk.const int rgbsize=256*256*256;uchar* rgbarray = new uchar[rgbsize];memset(rgbarray, 0, rgbsize * sizeof(uchar));Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);Mat csvimg2=ygdata->getSamples();int csvrows=csvimg2.rows; //csv points//cout<<"primer csvrows:"<<csvrows<<endl;for (int j = 0; j < csvrows; j++){float* pixeldata = csvimg2.ptr<float>(j);float x = pixeldata[0];float y = pixeldata[1];float z = pixeldata[2];int newindex = x + y * 256 + z * 256 * 256;rgbarray[newindex] = 255;}TickMeter tm;tm.start();//get the src images from the disk.int imgwidth,imgheight;int ii=817;sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);Mat srcimg=imread(filename);imgheight=srcimg.rows;imgwidth=srcimg.cols;int pixels=imgheight*imgwidth;int srcdatasize=pixels*3*sizeof(uchar);cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);cl_mem  srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);size_t sumsize=1454*sizeof(int);cl_mem  sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY| CL_MEM_ALLOC_HOST_PTR, sumsize, NULL,&status);cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int),  &imgwidth);status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int),  &imgheight);status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem),  (void*)&sumArray_buffer);size_t localsize[2]={1024,1};size_t globalsize[2]={1024,1024};status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);if (status != CL_SUCCESS){cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;return(EXIT_FAILURE);}status=clFinish(queue);if (status != CL_SUCCESS){cout<<"clFinish() failed..."<<status<<endl;return(EXIT_FAILURE);}int *sumMap=NULL;sumMap=(int*)clEnqueueMapBuffer(queue,sumArray_buffer,CL_TRUE, CL_MAP_READ, 0, sumsize, 0, NULL, NULL, &status);clEnqueueUnmapMemObject(queue, sumArray_buffer, (void*)sumMap, 0, NULL, NULL);int finalSum=0;for(int j=0;j<1454;j++){finalSum+=sumMap[j];//cout<<"ID:  "<<j<<"--Value: "<<sumMap[j]<<endl;}cout<<"host fluore result:"<<finalSum<<endl;tm.stop();cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseProgram(program);clReleaseKernel(kernel_imgProc);clReleaseMemObject(srcdata_buffer);clReleaseMemObject(rgbArray_buffer);delete [] rgbarray;//free(sumMap);return 0;
}

测试结果对比:

8,kernel与二进制

按照 http://www.cnblogs.com/mikewolf2002/archive/2012/09/06/2674125.html 这个大神的,将kernel转成二进制.bin文件。下载http://www.cnblogs.com/mikewolf2002/archive/2012/09/06/2674125.html 他的gclFile.h和gclFile.cpp文件即可。

char **binaries = (char **)malloc( sizeof(char *) * 1 ); //只有一个设备size_t *binarySizes = (size_t*)malloc( sizeof(size_t) * 1 );status = clGetProgramInfo(program,CL_PROGRAM_BINARY_SIZES,sizeof(size_t) * 1,binarySizes, NULL);binaries[0] = (char *)malloc( sizeof(char) * binarySizes[0]);status = clGetProgramInfo(program,CL_PROGRAM_BINARIES,sizeof(char *) * 1, binaries,NULL);    kernelFile.writeBinaryToFile("vecadd.bin", binaries[0],binarySizes[0]);

然后下次将.bin加载进OpenCL工程:

gclFile kernelFile;if(!kernelFile.readBinaryFromFile("fluore_open.bin")){printf("can not load the kernel file.\n");}const char *binary=kernelFile.source().c_str();size_t binarySize=kernelFile.source().size();cl_program program=clCreateProgramWithBinary(context,1,&device,(const size_t*)&binarySize,(const unsigned char**)&binary,NULL,NULL);

其它都没什么区别。已经测试过 正确。

心情好,臭美一下:

膨胀腐蚀-OpenCL加速及kernel变成二进制文件相关推荐

  1. opencl 加速 c语言程序_在AlveoU200加速卡上实现简单手写数字识别

    最近实验室租了块xilinx家的AlveoU200加速卡,过去几天被这块板吸引了注意力.刚开始了解,做点什么来试试水呢?一想,可以把曾经学 @蔡宇杰 大佬在pynq-z2上做的那个手写数字识别工程在这 ...

  2. 膨胀腐蚀-利用直线模板和圆模板及python-opencv代码实现

    也是这几天做项目才理解到,原来膨胀腐蚀还可以利用线条模板和圆的模板去做,之前都是使用矩形模板去做的. 先介绍一下opencv中膨胀腐蚀经常用到的函数: cv2.erode()/cv2.dilate() ...

  3. 膨胀腐蚀操作(MATLAB)

    %膨胀腐蚀操作 clear,clc,close all; Image = rgb2gray(imread('pic01.bmp')); figure,imshow(Image),title('原图') ...

  4. Halcon膨胀腐蚀

    简介: 膨胀可以增大区域,填充内部孔和增强了离散点,但增大了面积.腐蚀可以缩小区域,减少间隔区域的连接和离散点的数量,但减小了面积. 开运算是先腐蚀后膨胀,有益于消除线和点区域,减少干扰.闭运算是先膨 ...

  5. opencv-python数图实验(三)膨胀腐蚀、击中

    1.基础实验部分 1. 了解膨胀.腐蚀原理 2. 了解开运算.闭运算原理 3. 掌握膨胀.腐蚀函数的使用 4. 掌握开运算.闭运算函数的使用 实现效果 开运算.闭运算 腐蚀.膨胀 二.思考题 2.1: ...

  6. Matlab利用膨胀腐蚀计算硬币数目

    Matlab利用膨胀腐蚀计算硬币数目 这段时间一直在学习Matlab图像处理,浅浅的记录一下一个简单的膨胀腐蚀算法. sample04.jpg 代码如下: %%计算图中硬币的数量I=imread('s ...

  7. unity解码4k图片过慢,使用turbojpeg加速,使用opencl加速,使用libjpeg,使用v4l2

    后记: 果然贻笑大方,libjpeg也不记得哪个文章说最新版是1996年的,还煞有介事的给了一个1996年以后再没更新过的所谓官网,实际上libjpeg一直有维护和更新,支持到的最新版vs比我用的还要 ...

  8. opencv膨胀腐蚀

    OpenCV 是一个开源的计算机视觉库,它包含了许多图像处理的功能,其中膨胀和腐蚀是两种常用的形态学操作. 膨胀(Dilation):膨胀操作是将图像中的高亮区域(白色像素)扩张,从而填充低亮区域(黑 ...

  9. linux下cpu opencl加速,GPU挑战CPU!详解CUDA+OpenCL威力

    众所周知,GPU拥有数十倍于CPU的浮点运算能力,但如此强大的实力多数情况下只能用来玩游戏,岂不可惜?因此近年来业界都在致力于发掘GPU的潜能,让它能够在非3D.非图形领域大展拳脚. 1999年,首颗 ...

最新文章

  1. CBC2020 第五届中国计算机学会生物信息学会议(The Fifth CCF Bioinformatics Conference,简称CBC 2020) 2020年10月16日-10月18日在哈尔滨
  2. python爬取小说写入txt_对新笔趣阁小说进行爬取,保存和下载!这就是Python的魅力...
  3. illustrator条形码_Barcode Producer for Mac(创建条形码软件)
  4. Pwn环境配置(二)——VMware虚拟机安装Ubuntu 16.04系统
  5. Mint-ui中loadmore(上拉加载下拉刷新)组件在ios中滑动会触发点击事件的解决方法...
  6. Lombok的使用方法
  7. PHP函数调用的新的用法
  8. java使用JMail通过QQ邮件服务器实现自动发送邮件
  9. 手把手带你走进卷积神经网络!
  10. iterator adapter reverse_iterator
  11. Leetcode513. Find Bottom Left Tree Value找树左下角的值
  12. 阶段3 2.Spring_03.Spring的 IOC 和 DI_6 spring中bean的细节之三种创建Bean对象的方式
  13. vscode 插件: REST Client 介绍
  14. Ubuntu 实现shell文件的开机运行(从原理到实现)
  15. 计算机青蓝云题库,计算机三级上机题库 计算机三级网络技术上机题库《南开100题》.doc...
  16. avi格式如何转换成mp4格式
  17. TJA1043 CanTrcv
  18. Windows“启动”文件夹
  19. Instruction Set Principles
  20. CorelDraw2022完整版图形平面设计

热门文章

  1. 【语义分割】2022-HRViT CVPR
  2. phpcustom数据库录入
  3. 阿里云云栖社区合作指南
  4. nokogiri 足球比赛数据
  5. 数值分析 python_数值分析实验之矩阵特征值(Python代码)
  6. java 存储空间 简单分析
  7. python提示IndentationError: unexpected indent错误
  8. 靠谱的动漫培训班怎么选
  9. (Router)路由交换实验
  10. ChatGPT能取代多少程序员?