IMX8M系列 OpenCL FFT 示例编译测试及其他demo测试(MYD-JX8MX)

上一篇文章已经将如何编译镜像,如何用官方的方式FslBuild.py 脚本编译demo。不知道有没有成功的朋友,如果你成功了,那么可以依然用这种方式,如果你没有成功,那么你可以参考接下来的方式来编译你的demo

导出文件系统

上一篇文章有描述如何导出文件系统,这里的目的是为了提供编译所需的头文件和库文件等
具体方法如下:

#进入源码root目录
cd fsl-release-yocto
#执行设置环境变量的脚本
. ./setup-environment build-xwayland
#编译工具链
bitbake meta-toolchain
#安装工具链
cd ~/fsl-release-yocto/build-xwayland/tmp/deploy/sdk
./fsl-imx-xwayland-glibc-x86_64-meta-toolchain-aarch64-toolchain-4.9.88-2.0.0.sh
./opt/fsl-imx-xwayland/4.19-warrior/environment-setup-aarch64-poky-linux
#提取文件系统
runqemu-extract-sdk ~/fsl-release-yocto/build-xwayland/tmp/deploy/images/imx8mqevk/fsl-image-qt5-validation-imx-imx8mqevk-20210809020904.rootfs.tar.bz2 ~/imx8mqevk-rootfs

提取好的文件系统在~/imx8mqevk-rootfs 目录下
具体内容如下:

该文件系统时利用交叉编译方式编译出来的,有我们所需的头文件和库等。

编译FFT

这里我利用gtec-demo-frameworkNXP提供的示例来单独编译FFT用于简单的测试其GPU的性能

main.cpp


#include "clutil.h"int
main(const int argc,const char* argv[])
{if (argc < 2){printf("Usage: %s fftlen \n", argv[0]);return -1;}const unsigned len = atoi(argv[1]);if (len > FFT_MAX){printf("FFT length cannot be greater than %d.\n", FFT_MAX);return -1;}if (len < 16){printf("FFT length has to at least be 16.\n");return -1;}if ((len != 1) && (len & (len - 1))){printf("FFT length (%d) must be a power-of-2.\n", len);return -1;}printf("Block size: %d \n", blockSize);printf("Print result: %s \n", print ? "yes" : "no");int result = 0;result = runFFT(len);if (result == 0){printf("Successful.\n");if (print) printResult(len);}else{printf("Failed.\n");}cleanup();
}

fft.cpp


#include "clutil.h"
#include <time.h>
static unsigned workOffset;
static unsigned workSize;
static int p[FFT_MAX_LOG2N]       = {  1,   2,   4,   8,   16,   32,   64,   128,   256,   512,   1024,   2048,   4096,   8192,   16384,   32768};
static int twop[FFT_MAX_LOG2N]    = {2*1, 2*2, 2*4, 2*8, 2*16, 2*32, 2*64, 2*128, 2*256, 2*512, 2*1024, 2*2048, 2*4096, 2*8192, 2*16384, 2*32768};
static int threep[FFT_MAX_LOG2N]  = {3*1, 3*2 ,3*4, 3*8, 3*16, 3*32, 3*64, 3*128, 3*256, 3*512, 3*1024, 3*2048, 3*4096, 3*8192, 3*16384, 3*32768};
static int pminus1[FFT_MAX_LOG2N] = {1-1, 2-1, 4-1, 8-1, 16-1, 32-1, 64-1, 128-1, 256-1, 512-1, 1024-1, 2048-1, 4096-1, 8192-1, 16384-1, 32768-1};#ifndef M_PI
#define M_PI 3.14159265358979f
#endif
static cl_float minusPIoverp[FFT_MAX_LOG2N]     = {    -M_PI,         -M_PI/2.f,     -M_PI/4.f,     -M_PI/ 8.f,     -M_PI/16.f,     -M_PI/32.f,     -M_PI/ 64.f,     -M_PI/128.f,     -M_PI/256.f,     -M_PI/ 512.f,     -M_PI/1024.f,     -M_PI/2048.f,     -M_PI/4096.f,     -M_PI/ 8192.f,     -M_PI/16384.f,     -M_PI/32768.f};
static cl_float minusPIover2p[FFT_MAX_LOG2N]    = {    -M_PI/2.f,     -M_PI/4.f,     -M_PI/8.f,     -M_PI/16.f,     -M_PI/32.f,     -M_PI/64.f,     -M_PI/128.f,     -M_PI/256.f,     -M_PI/512.f,     -M_PI/1024.f,     -M_PI/2048.f,     -M_PI/4096.f,     -M_PI/8192.f,     -M_PI/16384.f,     -M_PI/32768.f,     -M_PI/65536.f};
static cl_float minusPIover2p_2x[FFT_MAX_LOG2N] = {-2.f*M_PI/2.f, -2.f*M_PI/4.f, -2.f*M_PI/8.f, -2.f*M_PI/16.f, -2.f*M_PI/32.f, -2.f*M_PI/64.f, -2.f*M_PI/128.f, -2.f*M_PI/256.f, -2.f*M_PI/512.f, -2.f*M_PI/1024.f, -2.f*M_PI/2048.f, -2.f*M_PI/4096.f, -2.f*M_PI/8192.f, -2.f*M_PI/16384.f, -2.f*M_PI/32768.f, -2.f*M_PI/65536.f};
static cl_float minusPIover2p_3x[FFT_MAX_LOG2N] = {-3.f*M_PI/2.f, -3.f*M_PI/4.f, -3.f*M_PI/8.f, -3.f*M_PI/16.f, -3.f*M_PI/32.f, -3.f*M_PI/64.f, -3.f*M_PI/128.f, -3.f*M_PI/256.f, -3.f*M_PI/512.f, -3.f*M_PI/1024.f, -3.f*M_PI/2048.f, -3.f*M_PI/4096.f, -3.f*M_PI/8192.f, -3.f*M_PI/16384.f, -3.f*M_PI/32768.f, -3.f*M_PI/65536.f};static int
radix(int N)
{int i = 0, j = 0;for (; i <= 31; i++){if ((N & (1 << i)) == 0){j++;}else{break;}}return (0 == (j%2)) ? 4 : 2;
}static unsigned int
log2NFFT(unsigned int size)
{unsigned int v = size;unsigned int log2n = 0;while (v >>= 1){log2n++;}return log2n;
}#define RADIX2_FFT_KERNEL "fft_radix2"
#define RADIX4_FFT_KERNEL "fft_radix4"static void
FFTGpu(const unsigned len)
{if (len == 0){return;}// figure out if we can use a radix-4 FFT : otherwise radix-2int rad = radix(len);if (4==rad && ((16==len) || (256==len) || (4096==len) || (65536==len) || (1048576 == len) ))rad = 2;// log2(n) is the # of kernels that will be invoked (for a radix-2 FFT)unsigned int log2n = log2NFFT(len);printf("log2(fft size) = log2(%d)=%d\n", len, log2n);printf("Compiling  radix-%d FFT Program for GPU...\n", rad);compileProgram("fft.cl");printf("creating radix-%d kernels...\n", rad);if (2 == rad){for (unsigned kk = 0; kk < log2n; kk++){printf("Creating kernel %s %d (p=%d)...\n", RADIX2_FFT_KERNEL, kk, p[kk]);createFFTKernel(RADIX2_FFT_KERNEL, kk);}}else{ // radix-4for (unsigned kk = 0; kk < log2n; kk+=2){printf("Creating kernel %s %d...\n", RADIX4_FFT_KERNEL, kk>>1);createFFTKernel(RADIX4_FFT_KERNEL, kk>>1);}}workSize = len;allocateDeviceMemory(workSize, workOffset);if (2 == rad){// FFT kernel invoked for p=1, p=2, ..., p=n/2// input and output swapped each timefor (unsigned kk = 0; kk < log2n; kk++){void *in = (0 == (kk&1)) ? &d_intime : &d_outfft;void *out = (0 == (kk&1)) ? &d_outfft : &d_intime;printf("Setting kernel args for kernel %d (p=%d)...\n", kk, p[kk]);clSetKernelArg(kernels[kk], 0, sizeof(cl_mem), in);clSetKernelArg(kernels[kk], 1, sizeof(cl_mem), out);clSetKernelArg(kernels[kk], 2, sizeof(unsigned), &p[kk]);clSetKernelArg(kernels[kk], 3, sizeof(unsigned), &pminus1[kk]);clSetKernelArg(kernels[kk], 4, sizeof(cl_float), &minusPIoverp[kk]);} // end (for 1,2,4,8,...N/2)}else{// radix-4, FFT kernel invoked for p=1, p=4, ..., p=n/4for (unsigned kk = 0; kk < log2n; kk+=2){int idx   = kk>>1;void *in  = (0 == (idx&1)) ? &d_intime : &d_outfft;void *out = (0 == (idx&1)) ? &d_outfft : &d_intime;printf("Setting kernel args for kernel %d (p=%d)...\n", idx, p[kk]);clSetKernelArg(kernels[idx], 0, sizeof(cl_mem), in);clSetKernelArg(kernels[idx], 1, sizeof(cl_mem), out);clSetKernelArg(kernels[idx], 2, sizeof(unsigned), &p[kk]);clSetKernelArg(kernels[idx], 3, sizeof(unsigned), &pminus1[kk]);clSetKernelArg(kernels[idx], 4, sizeof(unsigned), &twop[kk]);clSetKernelArg(kernels[idx], 5, sizeof(unsigned), &threep[kk]);clSetKernelArg(kernels[idx], 6, sizeof(cl_float), &minusPIover2p[kk]);clSetKernelArg(kernels[idx], 7, sizeof(cl_float), &minusPIover2p_2x[kk]);clSetKernelArg(kernels[idx], 8, sizeof(cl_float), &minusPIover2p_3x[kk]);} // end (for 1,4,16,...,N/4)} // end (if radix-2 or radix-4)size_t globalWorkSize[] = { (2==rad) ? (1<<(log2n-1)) : (len>>2) };size_t localWorkSize[] = { (blockSize <= globalWorkSize[0]) ? blockSize : globalWorkSize[0] };cl_int ciErrNum = 0;cl_mem d_result;clock_t start,end1,end2,end3;start = clock();Cl_finish();if (2==rad){for (unsigned kk = 0; kk < log2n; kk++){// note to self: up to 8 it works, beyond that it does notprintf("running kernel %d (p=%d)...\n", kk, p[kk]);runKernelFFT(localWorkSize, globalWorkSize, kk);d_result = (0 == (kk&1)) ? d_outfft : d_intime;}}else{// radix-4for (unsigned kk = 0; kk < log2n; kk+=2){int idx = kk>>1;printf("running kernel %d (p=%d)...\n", idx, p[kk]);runKernelFFT(localWorkSize, globalWorkSize, idx);d_result = (0 == (kk&1)) ? d_outfft : d_intime;}}Cl_finish();end1 = clock();//printf("time_1: %f s\n",double(end-start)/CLOCKS_PER_SEC);copyFromDevice(d_result, h_outfft + workOffset,  2*workSize);end2 = clock();//printf("time_2: %f s\n",double(end-start)/CLOCKS_PER_SEC);printGpuTime((2==rad)?log2n:(log2n>>1));end3 = clock();printf("time_1: %f s\ntime_2: %f s\ntime_3: %f s\n",\double(end1-start)/CLOCKS_PER_SEC,\double(end2-start)/CLOCKS_PER_SEC,\double(end3-start)/CLOCKS_PER_SEC);
}int
runFFT(const unsigned len)
{cl_int err;err = initExecution(len);if (err){return err;}FFTGpu(len);return 0;
}

clutil.cpp


#include "clutil.h"
#ifdef UNDER_CE
#include <windows.h>
#endif// global variables
cl_context cxContext = 0;
cl_program cpProgram = 0;
cl_device_id cdDeviceID[2];
cl_kernel kernels[FFT_MAX_LOG2N];
cl_command_queue commandQueue;
cl_event gpuExecution[FFT_MAX_LOG2N];#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))// default configs
unsigned blockSize = 16;
unsigned print = 1;// h_Freal and h_Fimag represent the input signal to be transformed.
// h_Rreal and h_Rimag represent the transformed output.
float*  h_Freal = 0;
float*  h_Fimag = 0;
float*  h_Rreal = 0;
float*  h_Rimag = 0;
//  real & imag interleaved
float* h_intime = 0; // time-domain input samples
float* h_outfft = 0; // freq-domain output samples// d_Freal and d_Fimag represent the input signal to be transformed.
// d_Rreal and d_Rimag represent the transformed output.
cl_mem d_Freal;
cl_mem d_Fimag;
cl_mem d_Rreal;
cl_mem d_Rimag;
//  real & imag interleaved
cl_mem d_intime; // time-domain input samples
cl_mem d_outfft; // freq-domain output samplesint
initExecution(const unsigned len)
{// Allocate host memory (and initialize input signal)allocateHostMemory(len);printf("Initializing device(s)...\n");// create the OpenCL context on available GPU devicesinit_cl_context(CL_DEVICE_TYPE_GPU);const cl_uint ciDeviceCount =  getDeviceCount();printf("ciDeviceCount:%d \n",ciDeviceCount);if (!ciDeviceCount){printf("No opencl specific devices!\n");return -1;}const cl_uint ciComputeUnitsCount = getNumComputeUnits();printf("# compute units = %d\n", ciComputeUnitsCount);printf("Creating Command Queue...\n");// create a command queue on device 0createCommandQueue();return 0;
}void
printGpuTime(const unsigned int kernelCount)
{double t, total = 0;for (unsigned k = 0; k<kernelCount; ++k){t = executionTime(gpuExecution[k]);printf("Kernel execution time on GPU (kernel %d) : %10.6f seconds\n", k, t);total += t;}printf("Total Kernel execution time on GPU : %10.6f seconds\n",total);
}void
printResult(const unsigned size)
{FILE *fp;
#ifdef UNDER_CEwchar_t moduleName[MAX_PATH];char path[MAX_PATH], * p;GetModuleFileName(NULL, moduleName, MAX_PATH);wcstombs(path, moduleName, MAX_PATH);p = strrchr(path, '\\');strcpy(p + 1, "fft_output.csv");fp = fopen(path, "w+");
#elsefp = fopen("fft_output.csv", "w+");
#endifif (fp == NULL) return;for (unsigned i = 0; i < size; ++i){fprintf(fp, "%f,%f\n", h_outfft[2*i], h_outfft[2*i+1]);}fclose(fp);
}double
executionTime(const cl_event event)
{cl_ulong start, end;cl_int err;err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);if (err){return 0;}printf("start:%llu, end:%llu \n", start, end);return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds
}void
allocateHostMemory(const unsigned len)
{h_Freal = (float *) malloc(sizeof(float) * len);checkError((h_Freal != NULL), CL_TRUE, "Could not allocate memory");h_Fimag = (float *) malloc(sizeof(float) * len);checkError((h_Fimag != NULL), CL_TRUE, "Could not allocate memory");h_Rreal = (float *) malloc(sizeof(float) * len);checkError((h_Rreal != NULL), CL_TRUE, "Could not allocate memory");h_Rimag = (float *) malloc(sizeof(float) * len);checkError((h_Rimag != NULL), CL_TRUE, "Could not allocate memory");//  real/imag interleaved input time-domain samplesh_intime = (float *) malloc(sizeof(float) * len * 2);checkError((h_intime != NULL), CL_TRUE, "Could not allocate memory");//  real/imag interleaved output FFT datah_outfft = (float *) malloc(sizeof(float) * len * 2);checkError((h_outfft != NULL), CL_TRUE, "Could not allocate memory");const unsigned n = 16;for (unsigned i = 0 ; i < len; ++i){h_Freal[i] = (i + 1) % n;h_Fimag[i] = (i + 1) % n;h_intime[2*i] = h_intime[2*i+1] = (i + 1) % n;h_Rreal[i] = 0;h_Rimag[i] = 0;h_outfft[2*i] = h_outfft[2*i+1] = 0;}if (print){FILE *fp = NULL;
#ifdef UNDER_CEwchar_t moduleName[MAX_PATH];char path[MAX_PATH], * p;GetModuleFileName(NULL, moduleName, MAX_PATH);wcstombs(path, moduleName, MAX_PATH);p = strrchr(path, '\\');strcpy(p + 1, "fft_input.csv");fp = fopen(path, "w+");
#elsefp = fopen("fft_input.csv", "w+");
#endifif (fp == NULL) return;for (unsigned int kk=0; kk<len; kk++){fprintf(fp, "%f,%f\n", h_intime[2*kk], h_intime[2*kk+1]);}fclose(fp);}
}void
allocateDeviceMemory(const unsigned size,const unsigned copyOffset)
{d_Freal = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Freal + copyOffset);copyToDevice(d_Freal,  h_Freal + copyOffset, size);d_Fimag = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Fimag + copyOffset);copyToDevice(d_Fimag,  h_Fimag + copyOffset, size);//  copy real/imag interleaved input data to deviced_intime = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_intime + copyOffset * 2);copyFromDevice(d_intime, h_outfft, size * 2); // debugd_Rreal = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rreal + copyOffset);copyToDevice(d_Rreal,  h_Rreal + copyOffset, size);d_Rimag = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rimag + copyOffset);copyToDevice(d_Rimag,  h_Rimag + copyOffset, size);//  copy real/imag interleaved out FFT to deviced_outfft = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_outfft + copyOffset * 2);copyToDevice(d_intime,  h_outfft + copyOffset * 2, size * 2);
}void
cleanup(void)
{if (d_Freal)  clReleaseMemObject(d_Freal);if (d_Fimag)  clReleaseMemObject(d_Fimag);if (d_Rreal)  clReleaseMemObject(d_Rreal);if (d_Rimag)  clReleaseMemObject(d_Rimag);if (d_intime) clReleaseMemObject(d_intime);if (d_outfft) clReleaseMemObject(d_outfft);for (unsigned kk=0; kk<ARRAY_SIZE(kernels); kk++) {if (gpuExecution[kk]) clReleaseEvent(gpuExecution[kk]);}if (commandQueue) clReleaseCommandQueue(commandQueue);if (cpProgram) clReleaseProgram(cpProgram);if (cxContext) clReleaseContext(cxContext);free(h_Freal);h_Freal = 0;free(h_Fimag);h_Fimag = 0;free(h_Rreal);h_Rreal = 0;free(h_Rimag);h_Rimag = 0;free(h_intime);h_intime = 0;free(h_outfft);h_outfft = 0;
}void
checkError(const cl_int ciErrNum,const cl_int ref,const char* const operation)
{if (ciErrNum != ref) {printf("ERROR:: %d %s failed\n\n", ciErrNum, operation);cleanup();exit(EXIT_FAILURE);}
}void
init_cl_context(const cl_device_type device_type)
{cl_int ciErrNum = CL_SUCCESS;#ifndef WIN32cxContext = clCreateContextFromType(0, /* cl_context_properties */device_type,NULL, /* error function ptr */NULL, /* user data to be passed to err fn */&ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#elsecl_platform_id cpPlatform;ciErrNum =     clGetPlatformIDs(1, &cpPlatform, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");cl_uint uiNumDevices;ciErrNum = clGetDeviceIDs(cpPlatform, device_type, 0, NULL, &uiNumDevices);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");cl_device_id cdDevices[20];ciErrNum = clGetDeviceIDs(cpPlatform, device_type, uiNumDevices, cdDevices, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");cl_uint targetDevice=0, uiNumDevsUsed=1;cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#endif
}cl_uint
getDeviceCount(void)
{size_t nDeviceBytes;const cl_int ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);checkError(ciErrNum, CL_SUCCESS, "clGetContextInfo");return ((cl_uint)nDeviceBytes/sizeof(cl_device_id));
}cl_uint
getNumComputeUnits(void)
{cl_platform_id cpPlatform;cl_int ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");//Get all the devicesprintf("Get the Device info and select Device...\n");cl_uint uiNumDevices;ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");// Set target device and Query number of compute units on targetDeviceprintf("# of Devices Available = %d\n", uiNumDevices);cl_uint num_compute_units;clGetDeviceInfo(cdDevices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);printf("# of Compute Units = %d\n", num_compute_units);free(cdDevices);return num_compute_units;
}void
createCommandQueue(void)
{cl_int ciErrNum = CL_SUCCESS;ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*2, &cdDeviceID, NULL);commandQueue = clCreateCommandQueue(cxContext, cdDeviceID[0], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateCommandQueue");
}void
compileProgram(const char* const kernel_file)
{size_t program_length;FILE* pFileStream = NULL;cl_int ciErrNum;#ifdef _WIN32
#ifdef UNDER_CEwchar_t moduleName[MAX_PATH];char path[MAX_PATH], * p;GetModuleFileName(NULL, moduleName, MAX_PATH);wcstombs(path, moduleName, MAX_PATH);p = strrchr(path, '\\');strcpy(p + 1, kernel_file);pFileStream = fopen(path, "rb");if (pFileStream == NULL){checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");}
#elseif(fopen_s(&pFileStream, kernel_file, "rb") != 0){checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");}
#endif
#elsepFileStream = fopen(kernel_file, "rb");if(pFileStream == 0){checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");}
#endif// get the length of the source codefseek(pFileStream, 0, SEEK_END);program_length = ftell(pFileStream);fseek(pFileStream, 0, SEEK_SET);// allocate a buffer for the source code string and read it inchar* source = (char *)malloc(program_length + 1);if (fread((source), program_length, 1, pFileStream) != 1){fclose(pFileStream);free(source);checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on read source");}fclose(pFileStream);source[program_length] = '\0';// Create the program for all GPUs in the contextcpProgram = clCreateProgramWithSource( cxContext, 1, (const char **) &source, &program_length, &ciErrNum);free(source);checkError(ciErrNum, CL_SUCCESS, "clCreateProgramWithSource");ciErrNum = clBuildProgram(cpProgram, 0, NULL, "", NULL, NULL);if (ciErrNum != CL_SUCCESS){char cBuildLog[10240];clGetProgramBuildInfo(cpProgram, cdDeviceID[0], CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL );printf("\nBuild Log : \n%s\n", cBuildLog);checkError(ciErrNum, CL_SUCCESS, "clBuildProgram");}
}void
createFFTKernel(const char* const kernelName,int kk)
{cl_int ciErrNum = CL_SUCCESS;kernels[kk] = clCreateKernel(cpProgram, kernelName, &ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateKernel");
}cl_mem
createDeviceBuffer(const cl_mem_flags flags,const size_t size,void* const hostPtr)
{cl_int ciErrNum = CL_SUCCESS;const cl_mem d_mem = clCreateBuffer(cxContext, flags | CL_MEM_COPY_HOST_PTR, size, hostPtr, &ciErrNum);checkError(ciErrNum, CL_SUCCESS,  "clCreateBuffer");return d_mem;
}void
copyToDevice(const cl_mem mem,float* const hostPtr,const unsigned size)
{const cl_int ciErrNum = clEnqueueWriteBuffer(commandQueue, mem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);checkError(ciErrNum, CL_SUCCESS,  "clEnqueueWriteBuffer");
}void
copyFromDevice(const cl_mem dMem,float* const hostPtr,const unsigned size)
{cl_int ciErrNum = clEnqueueReadBuffer(commandQueue, dMem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);checkError(ciErrNum, CL_SUCCESS, "clEnqueueReadBuffer");
}void
runKernelFFT(const size_t localWorkSize[],const size_t globalWorkSize[],const int kk)
{const cl_int ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernels[kk], 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &gpuExecution[kk]);checkError(ciErrNum, CL_SUCCESS, "clEnqueueNDRangeKernel");
}void Cl_finish(void)
{clFinish(commandQueue);
}

clutil.h


#ifndef __CLUTIL__
#define __CLUTIL__#include <CL/opencl.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#ifndef UNDER_CE
#include <sys/types.h>
#endifint initExecution(const unsigned len);
void checkError(const cl_int ciErrNum, const cl_int ref, const char* const operation);
void printResult(const unsigned size);
void init_cl_context(cl_device_type device_type);
cl_uint getDeviceCount();
cl_uint getNumComputeUnits();
void createCommandQueue();
void compileProgram(const char* const kernel_file);
void createFFTKernel(const char* const kernelName, int kk);
cl_mem createDeviceBuffer(const cl_mem_flags flags, const size_t size, void* const  hostPtr);
void runKernelFFT(const size_t localWorkSize[], const size_t globalWorkSize[], const int kk);
void copyToDevice(const cl_mem mem, float* const hostPtr, const unsigned size);
void copyFromDevice(const cl_mem dMem, float* const hostPtr, const unsigned size);
double executionTime(const cl_event event);
void allocateHostMemory(const unsigned len);
void allocateDeviceMemory(const unsigned size, const unsigned copyOffset);
void printGpuTime(const unsigned int kernelCount);
void cleanup();
int runFFT(const unsigned len);
void Cl_finish(void);// Support 2^16 = 65536 point FFT
#define FFT_MAX_LOG2N   20
#define FFT_MAX         (1 << FFT_MAX_LOG2N)extern unsigned blockSize;
extern unsigned print;extern float*  h_Freal;
extern float*  h_Fimag;
extern float*  h_Rreal;
extern float*  h_Rimag;
extern float*  h_intime; // time-domain input samples
extern float*  h_outfft; // freq-domain output samplesextern cl_mem d_Freal;
extern cl_mem d_Fimag;
extern cl_mem d_Rreal;
extern cl_mem d_Rimag;
extern cl_mem d_intime; // time-domain input samples
extern cl_mem d_outfft; // freq-domain output samplesextern cl_context cxContext;
extern cl_program cpProgram;
extern cl_kernel kernels[FFT_MAX_LOG2N];
extern cl_event gpuExecution[FFT_MAX_LOG2N];
extern cl_command_queue commandQueue;#endif

fft.cl

#define M_PI            3.14159265358979f
#define MUL_RE(a, b)    (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a, b)    (a.even*b.odd + a.odd*b.even)typedef float2 real2_t;
typedef float real_t;// complex multiply
real2_t mul(real2_t a,real2_t b)
{return (real2_t) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); // no mad
}// twiddle_P_Q(A) returns A * EXP(-P*PI*i/Q)
real2_t
twiddle_1_2(real2_t a)
{// A * (-i)return (real2_t) (a.y, -a.x);
}// Return A * exp(K*ALPHA*i)
real2_t
twiddle(real2_t a,int k,real_t alpha)
{real_t cs, sn;//sn = sincos((real_t)k*alpha, &cs);cs = native_cos((real_t) k * alpha);sn = native_sin((real_t) k * alpha);return mul(a, (real2_t) (cs, sn));
}// Return A * exp(KALPHA*i)
real2_t
twiddle_kalpha(real2_t a,real_t kalpha)
{real_t cs, sn;//sn = sincos((real_t) alpha, &cs);cs = native_cos((real_t) kalpha);sn = native_sin((real_t) kalpha);return mul(a, (real2_t) (cs, sn));
}// In-place DFT-2, output is (a, b).  Arguments must be variables.
#define DFT2(a, b)  { real2_t tmp = a - b; a += b; b = tmp; }// Compute T x DFT-2.
// T is the number of threads.
// N = 2*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,2,4,...,T.
// Each DFT-2 has input (X[I],X[I+T]), I=0..T-1,
// and output Y[J], Y|J+P], J = I with one 0 bit inserted at postion P. */
__kernel void
fft_radix2(__global const real2_t * x,__global real2_t * y,int p,int pminus1,real_t minusPIoverp)
{int t = get_global_size(0); // thread countint i = get_global_id(0);   // thread indexint k = i&pminus1;          // index in input sequence, in 0..P-1int j = ((i-k)<<1) + k;     // output indexreal_t alpha = minusPIoverp * (real_t) k;  // -M_PI*(real_t)k/(real_t)p;// Read and twiddle inputx += i;real2_t u0 = x[0];//real2_t u1 = twiddle(x[t], 1, alpha);real_t cs,sn;//sn = sincos(alpha, &cs);cs = native_cos(alpha);sn = native_sin(alpha);real2_t u1 = mul(x[t], (real2_t) (cs, sn));// In-place DFT-2DFT2(u0,u1);// Write outputy += j;y[0] = u0;y[p] = u1;
}// In-place DFT-4, output is (a, c, b, d). Arguments must be variables.
#define DFT4(a, b, c, d) { DFT2(a, c); DFT2(b, d); d=twiddle_1_2(d); DFT2(a, b); DFT2(c, d); }// Compute T x DFT-4.
// T is the number of threads.
// N = 4*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,4,16,...,T.
// Each DFT-4 has input (X[I],X[I+T],X[I+2*T],X[I+3*T]), I=0..T-1,
// and output (Y[J],Y|J+P],Y[J+2*P],Y[J+3*P], J = I with two 0 bits inserted at postion P.
__kernel void
fft_radix4(__global const float2 * x,__global float2 * y,int p,int pminus1,int twop,int threep,real_t minusPIover2p,real_t minusPIover2p_2x,real_t minusPIover2p_3x)
{int t = get_global_size(0); // thread countint i = get_global_id(0);   // thread indexint k = i&pminus1;          //(p-1); // index in input sequence, in 0..P-1int j = ((i - k) << 2) + k; // output indexreal_t alpha   = minusPIover2p    * (real_t) k; //-M_PI*(real_t)k/(real_t)(2*p);real_t alpha2x = minusPIover2p_2x * (real_t) k;real_t alpha3x = minusPIover2p_3x * (real_t) k;// Read and twiddle inputx += i;real2_t u0 = x[0];real2_t u1 = twiddle_kalpha(x[t],   alpha);     //twiddle(x[t],   1, alpha);real2_t u2 = twiddle_kalpha(x[2*t], alpha2x);   //twiddle(x[2*t], 2, alpha);real2_t u3 = twiddle_kalpha(x[3*t], alpha3x);   //twiddle(x[3*t], 3, alpha);// In-place DFT-4DFT4(u0, u1, u2, u3);// Shuffle and write outputy        += j;y[0]      = u0;y[p]      = u2;y[twop]   = u1;y[threep] = u3;
}

将该部分代码拷贝至~/imx8mqevk-rootfs/code/fft目录下
执行编译命令

$CXX -I../../usr/include -L../../usr/lib -lOpenCL -o fft main.cpp fft.cpp clutil.cpp
# 注意:  这里的$CXX的作用就是交叉编译工具, 他在脚本中设置为环境变量了。

结果如下

IMX8M系列 OpenCL FFT 示例编译及其他demo测试(MYD-JX8MX)相关推荐

  1. ubuntu编译ffmpeg并且demo测试

    一.Ubuntu下编译ffmpeg源码指令 下载ffmpeg,解压(跳过),需要安装一些基本的依赖库,如x264等: 切换到ffmpeg源码的目录,config指令 $ ./configure --e ...

  2. IMX8M系列 yocto编译镜像及demo编译(MYD-JX8MX)

    IMX8M系列 yocto编译镜像及demo编译(MYD-JX8MX) 前段时间由于工作需要,研究了一下米尔的MYD-JX8MX开发板,用的是NXP 的IMX8M型号芯片,说实话,官方提供的文档描述的 ...

  3. OpenCL,OpenGL编译

    OpenCL,OpenGL编译 TVM已经支持多个硬件后端:CPU,GPU,移动设备等-添加了另一个后端:OpenGL / WebGL. OpenGL / WebGL能够在没有安装CUDA的环境中利用 ...

  4. opencl fft实例整理

    ocl版的: https://github.com/betaupsx86/FFT_OCL python版的: A Python wrapper for the OpenCL FFT library c ...

  5. CUDA并行算法系列之FFT快速卷积

    CUDA并行算法系列之FFT快速卷积 卷积定义 在维基百科上,卷积定义为: 离散卷积定义为: [ 0, 1, 2, 3]和[0, 1, 2]的卷积例子如下图所示: Python实现(直接卷积) 根据离 ...

  6. libev学习系列之三:libev编译安装

    libev学习系列之三:libev编译安装 版本说明 版本 作者 日期 备注 0.1 ZY 2019.5.31 初稿 目录 文章目录 libev学习系列之三:libev编译安装 版本说明 目录 源码结 ...

  7. 【已解决】海康威视MFC综合示例(C++ 官网Demo)采用VS2019编译异常如何解决?

    采用VS2019编译运行海康威视MFC综合示例Demo 一.文章背景: 二.操作步骤: 1.海康威视设备网络SDK下载: 2.VS2019 MFC开发环境配置: 3.MFC综合示例编译运行: 三.小结 ...

  8. ESP32-WROOM-32D模组上传Arduino IDE示例编译的固件后总是重启,该如何解决

    手头的ESP32-WROOM-32D模组搭载的是单核CPU,因此Arduino IDE需要配置成单核编译模式.用ESP32的示例编译并上传固件后不断重启: 例如GetChipID,源代码: uint3 ...

  9. laya3d系列——如何用vscode编译laya项目

    laya3d系列--如何用vscode编译laya项目 ---------------------------------------------转载请说明出处,抄袭必举报查封------------ ...

最新文章

  1. 框架页面jquery装载
  2. 跟前腾讯总监学Java实战项目
  3. How is NGINX Unit different from Apache?
  4. Mac字体安装的方法?Mac怎么安装新字体?Mac字体安装教程
  5. 多媒体计算机辅助英语教学,多媒体计算机辅助初中英语教学的实践研究
  6. 工商数据采集的10个经典方法
  7. 学计算机的应届生怎么面试自我介绍,最新应届生面试自我介绍(精选3篇)
  8. 图数据和知识图谱,数字化转型的新引擎星环科技星环科技
  9. keras中model.compile()基本用法
  10. 日常开发中linux中最常用的100条命令
  11. androidstudio身高预测app
  12. 揭秘工业互联网的内涵、热点与难点!
  13. 【图文并茂】手把手教你重装Win10系统
  14. 谈谈keep-alive的理解
  15. b站React禹哥版视频笔记-React应用(基于react脚手架)
  16. 刷脸支付在流量金贵时代把控千万用户
  17. win10找不到文件无法卸载的解决方法
  18. createjs开发教程
  19. 满分回答教你如何应对面试中项目经验这一难关
  20. go语言读文件 java读文件_如何从Java中的文本文件逐行读取

热门文章

  1. UE4蓝图入门知识及UI制作控件事件
  2. php+mysql注销_PHP+mysql如何实现屏蔽用户的功能?
  3. 设计模式——Visitor(访问者)模式
  4. java通用文件换行符_java通用文件换行符
  5. Python图形界面设计
  6. hmailserver怎么搭建php,hMailServer设置
  7. Python中的 // 和 / 和 % 的用法区别
  8. i7 10700和i5 10400F参数对比差距大吗
  9. 人生的路,每一步都算数
  10. 如何不浪费青春,让游戏快速上架 Steam