转载请标明出处:https://blog.csdn.net/u013752202/article/details/92794209

1.创建opencl kernel

(1)创建kerenl并编译shading文件

(2)获取kernel工作组信息

2.GPU和CPU协同工作

(1)创建内存,并同步到CPU和GPU,如:

tablebuffer = opencl_create_mem(context, cinfo->comps_in_scan*DCTSIZE2*sizeof(float));

opencl_sync_mem(tablebuffer, TOCPU);

opencl_sync_mem(tablebuffer, TOGPU);

(2)设置GPU kernel参数,如:

int error;

size_t alignblocks;

cl_kernel kernel = jpeg_get_kernel(IDCT_FLOAT);

opencl_set_mem(kernel, inputBuffer, 0);

opencl_set_mem(kernel, yuvBuffer, 1);

opencl_set_mem(kernel, tablebuffer, 2);

opencl_set_mem(kernel, offsetbuffer, 3);

error = aclSetKernelArg(kernel, 4, sizeof(int), &cinfo->blocks_in_MCU);

assert(CL_SUCCESS == error);

error = aclSetKernelArg(kernel, 5, sizeof(size_t), &totalBlock);

assert(CL_SUCCESS == error);

(3)加入GPU渲染管线,并等待计算完成,如:

error = aclEnqueueNDRangeKernel(context->queue, kernel, 1, NULL, &alignblocks, gMaxGroupSize+IDCT_FLOAT, 0, NULL, NULL);

assert(CL_SUCCESS == error);

aclFinish(context->queue);

JpegNativeLib.cpp (JNI) :


//
// Created by Administrator on 2017/1/18.
//
extern "C"
{
#include "jpeglib.h"
#include <opencl/jpeg_opencl.h>
};
#include <jni.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>#include <android/log.h>
#define   LOG_TAG    "MYLOG"
#define   LOGI(...)  __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
#define   LOGE(...)  __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)#include <UsrGLED.h>
using namespace UGLES;void preDecode(FILE *infile, jpeg_decompress_struct *cinfo)
{struct jpeg_error_mgr jerr;cinfo->err = jpeg_std_error(&jerr);jpeg_stdio_src(cinfo, infile);(void) jpeg_read_header(cinfo, TRUE);/*必须设成float方式,若是cpu解码,这个会影响性能,但对于gpu来说没有关系,而且float方式精度最高,几乎不会造成图片失真*/cinfo->dct_method = JDCT_FLOAT;(void) jpeg_start_decompress(cinfo);auto width = cinfo->output_width;auto height = cinfo->output_height;LOGI("W=%d,H=%d",width,height);
}int jpegDecode(GLuint glTextureId,int flag)
{LOGI("Java_zhw_jpegclapp_MainActivity_jpegDecode");
{
#if 1/*opencl 解码的 api*/const char *inputfile = "/sdcard/3040x1520_4.jpeg";struct jpeg_decompress_struct cinfo;jpeg_create_decompress(&cinfo);//创建解码器FILE *infile = fopen(inputfile, "rb");if (NULL == infile) {LOGE("Open input file err.");return -1;}preDecode(infile, &cinfo);LOGI("GPU DECODE START");int ret = jpeg_decode_by_opencl(&cinfo,glTextureId,flag);fclose(infile);if (-1 == ret) {LOGE("jpeg_decode_by_opencl failed");return -1;}LOGI("GPU DECODE END");
#endif//用得到的pixels做一些事情//这里用abort而不是finish,直接中止掉(void) jpeg_abort_decompress(&cinfo);jpeg_destroy_decompress(&cinfo);return 0;
}}UsrGLES usrGLES;
GLuint textureId=0;extern "C"
void Java_zhw_jpegclapp_GL2JNILib_init(JNIEnv * env, jobject obj,  jint width, jint height)
{usrGLES.setupGraphics(width, height);textureId=usrGLES.getTextureID();if (-1 == openclInit()) {LOGE("openclInit failed");return;}if(-1==jpegDecode(textureId,0)) {LOGE("jpegDecode error !");return;}
}extern "C"
void Java_zhw_jpegclapp_GL2JNILib_step(JNIEnv * env, jobject obj)
{TIME_START;if(-1==jpegDecode(textureId,1)) {LOGE("jpegDecode error !");return;}TIME_END;usrGLES.renderFrame();
}extern "C"
void Java_zhw_jpegclapp_GL2JNILib_stop(JNIEnv * env, jobject obj)
{openclDestroy();
}

jpeg_opencl.h


#ifndef JPEG_OPENCL_H
#define JPEG_OPENCL_H
#include "opencl_package.h"
#include <stdio.h>
#include "jpeglib.h"
#include "jpegint.h"
#define DEUBUG_ON
#ifdef DEUBUG_ON
#include <sys/time.h>#include <android/log.h>
#include <GLES2/gl2.h>#define   LOG_TAG    "MYLOG"
#define   LOGI(...)  __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
#define   LOGE(...)  __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)#define TIME_START struct timeval tv_start, tv_end;gettimeofday(&tv_start, NULL);
#define TIME_END gettimeofday(&tv_end, NULL); LOGI("Time = %fms in %s, %d\n", (tv_end.tv_sec*1000000 + tv_end.tv_usec - tv_start.tv_sec*1000000 - tv_start.tv_usec)/1000.0f, __FUNCTION__, __LINE__);gettimeofday(&tv_start, NULL);
#else
#define TIME_START
#define TIME_END
#endif
opencl_context* jpeg_get_context();
typedef enum
{IDCT_FLOAT=0,YUV_RGB,KERNELNUMBER
}KERNELNAME;cl_kernel jpeg_get_kernel(KERNELNAME name);
int jpeg_decode_by_opencl(j_decompress_ptr cinfo, GLuint glTextureId,int flag);
int openclInit();
void openclDestroy();#endif

jpeg_opencl.c


#include "jpeg_opencl.h"
#include <pthread.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>#include <android/log.h>
#include <OpenCL/cl_gl.h>
#include <unistd.h>#include "aopencl.h"#define   LOG_TAG    "CL"
#define   LOGI(...)  __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
#define   LOGE(...)  __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)static opencl_context* gInstance = NULL;
static pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
static cl_kernel gKernel[KERNELNUMBER];
static size_t gMaxGroupSize[KERNELNUMBER];
#include "KernelWarp.h"
static void initKernel(opencl_context* context)
{int error;gKernel[IDCT_FLOAT] = opencl_compile_create_kernel(context, idct_kernel_clclh, "idct_float");gKernel[YUV_RGB] = opencl_compile_create_kernel(context,yuv_rgb_clclh, "yuv_rgb");LOGI("gInstance->device_id=0x%x",gInstance->device_id);error = aclGetKernelWorkGroupInfo(gKernel[IDCT_FLOAT], gInstance->device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), gMaxGroupSize+IDCT_FLOAT, NULL);LOGI("(gMaxGroupSize+IDCT_FLOAT)=%d",*(gMaxGroupSize+IDCT_FLOAT));assert(CL_SUCCESS == error);error = aclGetKernelWorkGroupInfo(gKernel[YUV_RGB], gInstance->device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), gMaxGroupSize+YUV_RGB, NULL);assert(CL_SUCCESS == error);
}opencl_context* jpeg_get_context()
{if (NULL == gInstance){pthread_mutex_lock(&gMutex);if (NULL == gInstance){gInstance = opencl_create_context(1);initKernel(gInstance);}pthread_mutex_unlock(&gMutex);}return gInstance;
}
cl_kernel jpeg_get_kernel(KERNELNAME name)
{assert(NULL!=gInstance);return gKernel[name];
}struct ComponentInfo
{int max_x_sample;int max_y_sample;int YW;int YH;int UW;int UH;int UOffset;int VW;int VH;int VOffset;int blocksInMCU;int MCU_Per_Row;
};opencl_mem* rgbBuffer;opencl_context* context;
opencl_mem* inputBuffer;
JBLOCK* mcu;
JBLOCK** MCU_buffer;
opencl_mem* yuvBuffer;
opencl_mem* tablebuffer;
opencl_mem* offsetbuffer;
int currentoffset = 0;
size_t totalBlock;
size_t totalMCUNumber;int openclInit()
{initFns();TIME_START;/*if (cinfo->dct_method != JDCT_FLOAT || cinfo->comps_in_scan != 3 || DCTSIZE != 8){return -1;}*/context = jpeg_get_context();TIME_END;return 0;
}void openclDestroy()
{opencl_destroy_mem(yuvBuffer);opencl_destroy_mem(rgbBuffer);opencl_destroy_mem(inputBuffer);opencl_destroy_context(context);
}int gloalValInit(j_decompress_ptr cinfo,GLuint glTextureId)
{TIME_START;totalMCUNumber = cinfo->MCU_rows_in_scan * cinfo->MCUs_per_row;totalBlock = totalMCUNumber*cinfo->blocks_in_MCU;for (int i=0;i<3; ++i){assert(cinfo->cur_comp_info[i]->DCT_scaled_size == 8);}inputBuffer = opencl_create_mem(context, totalBlock * sizeof(JBLOCK));TIME_END;cl_int err;LOGI("glTextureId =%d",glTextureId);rgbBuffer = (opencl_mem *) malloc(sizeof(opencl_mem));rgbBuffer->size = 4 * sizeof(unsigned char) * DCTSIZE2 * cinfo->MCU_rows_in_scan *cinfo->MCUs_per_row * cinfo->max_h_samp_factor * cinfo->max_v_samp_factor;rgbBuffer->queue = context->queue;rgbBuffer->map = NULL;rgbBuffer->base = aclCreateFromGLTexture(context->context, CL_MEM_READ_WRITE, GL_TEXTURE_2D,0, glTextureId, &err);if (0 != err) {//CL_INVALID_CONTEXTLOGE("clCreateFromGLTexture2D error,%p,err=%d", rgbBuffer->base, err);return -1;}else {LOGI("clCreateFromGLTexture success,rgbBuffer->base=%p", rgbBuffer->base);}yuvBuffer = opencl_create_mem(context, totalMCUNumber*cinfo->blocks_in_MCU * DCTSIZE2 * sizeof(unsigned char));TIME_END;return 0;
}int jpeg_decode_by_opencl(j_decompress_ptr cinfo,GLuint glTextureId,int flag)
{if(0==flag){if(-1==gloalValInit(cinfo,glTextureId)){return  -1;}}TIME_START;opencl_sync_mem(inputBuffer, TOCPU);TIME_END;jzero_far(inputBuffer->map, totalMCUNumber*cinfo->blocks_in_MCU * sizeof(JBLOCK));MCU_buffer = (JBLOCK**)malloc(sizeof(JBLOCK*)*cinfo->blocks_in_MCU);mcu = (JBLOCK*)inputBuffer->map;TIME_END;LOGI("totalMCUNumber=%d",totalMCUNumber);int i, blkn;for (i = 0; i < totalMCUNumber; ++i){for (blkn = 0; blkn < cinfo->blocks_in_MCU; ++blkn){MCU_buffer[blkn] = mcu + cinfo->blocks_in_MCU*i + blkn;}if ( FALSE == (*cinfo->entropy->decode_mcu) (cinfo, MCU_buffer)){break;}}free(MCU_buffer);opencl_sync_mem(inputBuffer, TOGPU);TIME_END;/*Upload quantry table*/offsetbuffer = opencl_create_mem(context, cinfo->blocks_in_MCU*sizeof(int));tablebuffer = opencl_create_mem(context, cinfo->comps_in_scan*DCTSIZE2*sizeof(float));opencl_sync_mem(tablebuffer, TOCPU);opencl_sync_mem(offsetbuffer, TOCPU);for (i = 0; i< cinfo->comps_in_scan; ++i){int j;jpeg_component_info* info = cinfo->cur_comp_info[i];float* table = (float*)info->dct_table;//FIXMEfloat* table_in_buffer = (float*)(tablebuffer->map) + DCTSIZE2*i;int* offset = (int*)offsetbuffer->map;memcpy(table_in_buffer, table, sizeof(float)*DCTSIZE2);for (j=0; j<info->h_samp_factor*info->v_samp_factor; ++j){offset[currentoffset + j] = i;}currentoffset+=info->h_samp_factor*info->v_samp_factor;}TIME_END;opencl_sync_mem(tablebuffer, TOGPU);opencl_sync_mem(offsetbuffer, TOGPU);/*idct*/{int error;size_t alignblocks;cl_kernel kernel = jpeg_get_kernel(IDCT_FLOAT);opencl_set_mem(kernel, inputBuffer, 0);opencl_set_mem(kernel, yuvBuffer, 1);opencl_set_mem(kernel, tablebuffer, 2);opencl_set_mem(kernel, offsetbuffer, 3);error = aclSetKernelArg(kernel, 4, sizeof(int), &cinfo->blocks_in_MCU);assert(CL_SUCCESS == error);error = aclSetKernelArg(kernel, 5, sizeof(size_t), &totalBlock);assert(CL_SUCCESS == error);alignblocks = (totalBlock + gMaxGroupSize[IDCT_FLOAT]-1)/gMaxGroupSize[IDCT_FLOAT]*gMaxGroupSize[IDCT_FLOAT];error = aclEnqueueNDRangeKernel(context->queue, kernel, 1, NULL, &alignblocks, gMaxGroupSize+IDCT_FLOAT, 0, NULL, NULL);assert(CL_SUCCESS == error);aclFinish(context->queue);}opencl_destroy_mem(tablebuffer);opencl_destroy_mem(offsetbuffer);TIME_END;/*TODO Sample YUV to RGB*/{int stride = cinfo->MCUs_per_row * cinfo->max_h_samp_factor * DCTSIZE;struct ComponentInfo info;size_t global[2] = {cinfo->MCUs_per_row*cinfo->max_h_samp_factor, cinfo->MCU_rows_in_scan*cinfo->max_v_samp_factor*DCTSIZE};cl_kernel kernel = jpeg_get_kernel(YUV_RGB);opencl_sync_mem(rgbBuffer, TOGPU);assert(NULL!=rgbBuffer);info.YW = cinfo->cur_comp_info[0]->h_samp_factor;info.YH = cinfo->cur_comp_info[0]->v_samp_factor;info.UW = cinfo->cur_comp_info[1]->h_samp_factor;info.UH = cinfo->cur_comp_info[1]->v_samp_factor;info.VW = cinfo->cur_comp_info[2]->h_samp_factor;info.VH = cinfo->cur_comp_info[2]->v_samp_factor;info.UOffset = info.YW*info.YH;info.VOffset = info.UOffset + info.UW*info.UH;info.blocksInMCU = cinfo->blocks_in_MCU;info.MCU_Per_Row = cinfo->MCUs_per_row;info.max_x_sample = cinfo->max_h_samp_factor;info.max_y_sample = cinfo->max_v_samp_factor;
//        LOGI("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d",info.YW,info.YH,info.UW,info.UH,info.VW,info.VH,info.UOffset,info.VOffset,info.blocksInMCU,info.MCU_Per_Row,info.max_x_sample,info.max_y_sample);
//        LOGI("stride=%d",stride);{int error ;opencl_set_mem(kernel, yuvBuffer, 0);opencl_set_mem(kernel, rgbBuffer, 1);aclSetKernelArg(kernel, 3, sizeof(int), &info.max_x_sample);aclSetKernelArg(kernel, 4, sizeof(int), &info.max_y_sample);aclSetKernelArg(kernel, 5, sizeof(int), &info.YW);aclSetKernelArg(kernel, 6, sizeof(int), &info.YH);aclSetKernelArg(kernel, 7, sizeof(int), &info.UW);aclSetKernelArg(kernel, 8, sizeof(int), &info.UH);aclSetKernelArg(kernel, 9, sizeof(int), &info.UOffset);aclSetKernelArg(kernel, 10, sizeof(int), &info.VW);aclSetKernelArg(kernel, 11, sizeof(int), &info.VH);aclSetKernelArg(kernel, 12, sizeof(int), &info.VOffset);aclSetKernelArg(kernel, 13, sizeof(int), &info.blocksInMCU);aclSetKernelArg(kernel, 14, sizeof(int), &info.MCU_Per_Row);assert(CL_SUCCESS == error);error = aclSetKernelArg(kernel, 2, sizeof(int), &stride);assert(CL_SUCCESS == error);error = aclEnqueueNDRangeKernel(context->queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);assert(CL_SUCCESS == error);aclFinish(context->queue);}/*Copy rgbbuffer to output_buf*/TIME_END;}//opencl_destroy_mem(yuvBuffer);//opencl_destroy_mem(rgbBuffer);LOGI("jpeg_decode_by_opencl END");return 1;
}

KernelWrap.h (shading程序)


const char* idct_kernel_clclh =
"#define DCTSIZE 8\n"
"#define DCTSIZE2 64\n"
"#define LOADSRC(i, src) convert_float8(vload8(i, src))*vload8(i, table)\n"
"__kernel void idct_float(__global short* input, __global unsigned char* output, __global const float* dequantilize_table, __global const int* order, int blocks_per_mcu, uint totalblocks)\n"
"{\n"
"    int blkn = get_global_id(0);\n"
"    if (blkn < totalblocks)\n"
"    {\n"
"        __global short* src = input + DCTSIZE2*blkn;\n"
"        __global unsigned char* outptr = output + DCTSIZE2*blkn;\n"
"        __global const float* table = dequantilize_table + order[blkn % blocks_per_mcu]*DCTSIZE2;\n"
"        float8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;\n"
"        float8 w0, w1, w2, w3, w4, w5, w6, w7;\n"
"        float8 tmp10, tmp11, tmp12, tmp13;\n"
"        float8 z5, z10, z11, z12, z13;\n"
"        tmp0 = LOADSRC(0, src);\n"
"        tmp1 = LOADSRC(2, src);\n"
"        tmp2 = LOADSRC(4, src);\n"
"        tmp3 = LOADSRC(6, src);\n"
"        tmp10 = tmp0 + tmp2;    /* phase 3 */\n"
"        tmp11 = tmp0 - tmp2;\n"
"        \n"
"        tmp13 = tmp1 + tmp3;    /* phases 5-3 */\n"
"        tmp12 = (tmp1 - tmp3) * (float8)1.414213562 - tmp13; /* 2*c4 */\n"
"        \n"
"        tmp0 = tmp10 + tmp13;    /* phase 2 */\n"
"        tmp3 = tmp10 - tmp13;\n"
"        tmp1 = tmp11 + tmp12;\n"
"        tmp2 = tmp11 - tmp12;\n"
"        \n"
"        tmp4 = LOADSRC(1, src);\n"
"        tmp5 = LOADSRC(3, src);\n"
"        tmp6 = LOADSRC(5, src);\n"
"        tmp7 = LOADSRC(7, src);\n"
"        \n"
"        z13 = tmp6 + tmp5;        /* phase 6 */\n"
"        z10 = tmp6 - tmp5;\n"
"        z11 = tmp4 + tmp7;\n"
"        z12 = tmp4 - tmp7;\n"
"        \n"
"        tmp7 = z11 + z13;        /* phase 5 */\n"
"        tmp11 = (z11 - z13) * (float8)(1.414213562); /* 2*c4 */\n"
"        \n"
"        z5 = (z10 + z12) * (float8)(1.847759065); /* 2*c2 */\n"
"        tmp10 = (float8)(1.082392200) * z12 - z5; /* 2*(c2-c6) */\n"
"        tmp12 = (float8)(-2.613125930) * z10 + z5; /* -2*(c2+c6) */\n"
"        \n"
"        tmp6 = tmp12 - tmp7;    /* phase 2 */\n"
"        tmp5 = tmp11 - tmp6;\n"
"        tmp4 = tmp10 + tmp5;\n"
"        \n"
"        tmp0 = tmp0 + tmp7;\n"
"        tmp7 = tmp0 - (float8)(2)*tmp7;\n"
"        tmp1 = tmp1 + tmp6;\n"
"        tmp6 = tmp1 - (float8)(2)*tmp6;\n"
"        tmp2 = tmp2 + tmp5;\n"
"        tmp5 = tmp2 - (float8)(2)*tmp5;\n"
"        tmp4 = tmp3 + tmp4;\n"
"        tmp3 = (float8)(2)*tmp3 - tmp4;\n"
"        /*Cross*/\n"
"#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)\n"
"        TRANS(w, 0);\n"
"        TRANS(w, 1);\n"
"        TRANS(w, 2);\n"
"        TRANS(w, 3);\n"
"        TRANS(w, 4);\n"
"        TRANS(w, 5);\n"
"        TRANS(w, 6);\n"
"        TRANS(w, 7);\n"
"#undef TRANS\n"
"        \n"
"        tmp10 = w0 + w4;\n"
"        tmp11 = w0 - w4;\n"
"        \n"
"        tmp13 = w2 + w6;\n"
"        tmp12 = (w2 - w6) * (float8)(1.414213562) - tmp13;\n"
"        \n"
"        tmp0 = tmp10 + tmp13;\n"
"        tmp3 = tmp10 - tmp13;\n"
"        tmp1 = tmp11 + tmp12;\n"
"        tmp2 = tmp11 - tmp12;\n"
"        \n"
"        z13 = w5 + w3;\n"
"        z10 = w5 - w3;\n"
"        z11 = w1 + w7;\n"
"        z12 = w1 - w7;\n"
"        \n"
"        tmp7 = z11 + z13;\n"
"        tmp11 = (z11 - z13) * (float8)(1.414213562);\n"
"        \n"
"        z5 = (z10 + z12) * (float8)(1.847759065); /* 2*c2 */\n"
"        tmp10 = (float8)(1.082392200) * z12 - z5; /* 2*(c2-c6) */\n"
"        tmp12 = (float8)(-2.613125930) * z10 + z5; /* -2*(c2+c6) */\n"
"        \n"
"        tmp6 = tmp12 - tmp7;\n"
"        tmp5 = tmp11 - tmp6;\n"
"        tmp4 = tmp10 + tmp5;\n"
"        \n"
"        tmp0 = tmp0 + tmp7;\n"
"        tmp7 = tmp0 - (float8)(2)*tmp7;\n"
"        tmp1 = tmp1 + tmp6;\n"
"        tmp6 = tmp1 - (float8)(2)*tmp6;\n"
"        tmp2 = tmp2 + tmp5;\n"
"        tmp5 = tmp2 - (float8)(2)*tmp5;\n"
"        tmp4 = tmp3 + tmp4;\n"
"        tmp3 = (float8)(2)*tmp3 - tmp4;\n"
"        /*Cross*/\n"
"#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)\n"
"        TRANS(w, 0);\n"
"        TRANS(w, 1);\n"
"        TRANS(w, 2);\n"
"        TRANS(w, 3);\n"
"        TRANS(w, 4);\n"
"        TRANS(w, 5);\n"
"        TRANS(w, 6);\n"
"        TRANS(w, 7);\n"
"#undef TRANS\n"
"        /* Final output stage: scale down by a factor of 8 and range-limit */\n"
"#define RESULT(t) convert_uchar8(clamp((t)/(float8)(8)+(float8)(128), (float8)(0), (float8)(255)))\n"
"        vstore8(RESULT(w0), 0, outptr);\n"
"        vstore8(RESULT(w7), 7, outptr);\n"
"        vstore8(RESULT(w1), 1, outptr);\n"
"        vstore8(RESULT(w6), 6, outptr);\n"
"        vstore8(RESULT(w2), 2, outptr);\n"
"        vstore8(RESULT(w5), 5, outptr);\n"
"        vstore8(RESULT(w4), 4, outptr);\n"
"        vstore8(RESULT(w3), 3, outptr);\n"
"#undef RESULT\n"
"    }\n"
"}\n"
;
const char* yuv_rgb_clclh =
"#define DCTSIZE2 64\n"
"#define DCTSIZE 8\n"
"struct ComponentInfo\n"
"{\n"
"    int max_x_sample;\n"
"    int max_y_sample;\n"
"    int YW;\n"
"    int YH;\n"
"    int UW;\n"
"    int UH;\n"
"    int UOffset;\n"
"    int VW;\n"
"    int VH;\n"
"    int VOffset;\n"
"    int blocksInMCU;\n"
"    int MCU_Per_Row;\n"
"};\n"
"__kernel void yuv_rgb(__global unsigned char* yuvbuffer,__write_only image2d_t rgba, int output_stride,int max_x_sample,int max_y_sample,"
"int YW,int YH,int UW,int UH,int UOffset,int VW,int VH,int VOffset,int blocksInMCU,int MCU_Per_Row)\n"
"{\n"
" struct ComponentInfo info;\n"
"info.max_x_sample=max_x_sample;\n"
"info.max_y_sample=max_y_sample;\n"
"info.YW=YW;\n"
"info.YH=YH;\n""info.UW=UW;\n""info.UH=UH;\n""info.UOffset=UOffset;\n""info.VW=VW;\n""info.VH=VH;\n""info.VOffset=VOffset;\n""info.blocksInMCU=blocksInMCU;\n""info.MCU_Per_Row=MCU_Per_Row;\n"
"    int x = get_global_id(0);\n"
"    int y_origin = get_global_id(1);\n"
"    int yoffset = y_origin % DCTSIZE;\n"
"    int y = y_origin/DCTSIZE;\n"
"    int mcux = x/info.max_x_sample;\n"
"    int mcuy = y/info.max_y_sample;\n"
"    __global unsigned char* basic = yuvbuffer + DCTSIZE2*info.blocksInMCU*(info.MCU_Per_Row*mcuy + mcux);\n"
"    __global unsigned char* Y = basic + DCTSIZE2*((x%info.YW) + 2*(y%info.YH)) + DCTSIZE*yoffset;\n"
"    __global unsigned char* U = basic + DCTSIZE2*((x%info.UW) + 2*(y%info.UH)+info.UOffset) + DCTSIZE*yoffset;\n"
"    __global unsigned char* V = basic + DCTSIZE2*((x%info.VW) + 2*(y%info.VH)+info.VOffset) + DCTSIZE*yoffset;\n"
"    float8 yy = convert_float8(vload8(0, Y));\n"
"    float8 uu = convert_float8(vload8(0, U)) - (float8)(128);\n"
"    float8 vv = convert_float8(vload8(0, V)) - (float8)(128);\n"
"    float8 r, g, b;\n"
"#define RESULT(x) (clamp(x, (float8)(0), (float8)(255)))\n"
"    r = RESULT((yy + (float8)(1.40200)*vv)/(float8)255.0);\n"
"    g = RESULT((yy - (float8)(0.34414)*uu - (float8)(0.71414)*vv)/(float8)255.0);\n"
"    b = RESULT((yy + (float8)(1.77200)*uu)/(float8)255.0);\n"
"#undef RESULT\n"
"    int2 coord= (int2)(x*DCTSIZE, y_origin);\n""    float4 color=(float4)(r.s0, g.s0, b.s0, 0);\n""    write_imagef(rgba,coord,color);\n""    coord= (int2)(x*DCTSIZE+1, y_origin);\n""    color=(float4)(r.s1, g.s1, b.s1, 0);\n""    write_imagef(rgba,coord,color);\n""    coord= (int2)(x*DCTSIZE+2, y_origin);\n""    color=(float4)(r.s2, g.s2, b.s2, 0);\n""    write_imagef(rgba,coord,color);\n""    coord= (int2)(x*DCTSIZE+3, y_origin);\n""    color=(float4)(r.s3, g.s3, b.s3, 0);\n""    write_imagef(rgba,coord,color);\n""    coord= (int2)(x*DCTSIZE+4, y_origin);\n""    color=(float4)(r.s4, g.s4, b.s4, 0);\n""    write_imagef(rgba,coord,color);\n""    coord= (int2)(x*DCTSIZE+5, y_origin);\n""    color=(float4)(r.s5, g.s5, b.s5, 0);\n""    write_imagef(rgba,coord,color);\n""    coord= (int2)(x*DCTSIZE+6, y_origin);\n""    color=(float4)(r.s6, g.s6, b.s6,0);\n""    write_imagef(rgba,coord,color);\n""    coord= (int2)(x*DCTSIZE+7, y_origin);\n""    color=(float4)(r.s7, g.s7, b.s7, 0);\n""    write_imagef(rgba,coord,color);\n"
"}\n"
;

转载请标明出处:https://blog.csdn.net/u013752202/article/details/92794209

Android使用GPU加速JPEG图片解码(Opencl)相关推荐

  1. Vulkan Video实现GPU加速视频编码/解码

    正文字数:929  阅读时长:2分钟 Vulkan是一套跨平台的图形API,由Khronos组织牵头进行制定,普遍被看作是OpenGL的后继者,目前版本已经来到1.2.175,仍然在不停地进行更新,其 ...

  2. 强制开启Android Webview GPU 加速的方法

    强制开启Android Webview GPU 加速的方法 常用方法 浏览器开启GPU加速可以让渲染的性能更好,可以有效的利用硬件的能力来提高页面的绘制帧率.在安卓平台使用webview的场景下,开发 ...

  3. 如何使用nVidia Falcor渲染框架进行GPU加速的图片处理

    图片处理在很多领域中都有很大的需求,比如计算机视觉等.比较简单且常用的方法是用openCV读取图片,然后用numpy进行图片处理.然而,只有当处理算法能够完全用numpy中提供的矩阵操作实现时,才能够 ...

  4. 【2022年第一期 CANN训练营学习笔记】进阶班应用开发课 大作业1-开发DVPP应用,输入,输出都是JPEG图片,且分辨率不同

    1.大作业1题目如下:开发DVPP应用,输入,输出都是JPEG图片,且分辨率不同. 根据作业提示,转换的思路如下: 原始JPEG图片->JPEG解码->YUV Resize->JPE ...

  5. stm32h743单片机嵌入式学习笔记6-压缩图片解码原理

    软件解码: JPEG/JPG 的解码过程可以简单的概述为如下几个部分: 1 .从文件头读出文件的相关信息. JPEG 文件数据分为文件头和图像数据两大部分,其中文件头记录了图像的版本. 长宽.采样因子 ...

  6. 使用GPU进行视频编解码

    GPU视频编解码研究现状 迄今为止,已有许多关于使用GPU加速视频编解码的文章发表,如下表所示.目前GPU加速视频编码的主要集中在运动估计(ME,Motion Estimate),运动补偿(MC,Mo ...

  7. java jpeg压缩解码_图片压缩(iOS)

    场景很简单,上传图片前压缩图片,节省流量和发图时间.最近看了看 iOS 的静态图片压缩,这里记个笔记.本人之前没学过 iOS 和 Swift,本文是一篇入门文章,描述不到位之处请大家多多批评斧正. ̄ω ...

  8. 数字图像处理、拼接,图像静态滤镜(GPUImage/GPU加速) - Android

    图像滤镜处理的两种方式:RGB点乘运算:GPU的矩阵运算(效率更高).图片处理中的计算:RGBA~利用自带的方法修改色调,饱和度,亮度来修改图片:矩阵~利用矩阵计算得到新的矩阵修改图片. 几个图像像素 ...

  9. android全平台编译libjpeg-turbo并基于ANativeWindow加载JPEG图片

    图形图像实践 android全平台编译libjpeg-turbo并基于ANativeWindow加载JPEG图片 android全平台编译libpng并基于ANativeWindow加载PNG图片 概 ...

最新文章

  1. Java并发面试,幸亏有点道行,不然又被忽悠了
  2. 高薪诚聘 | 匠韵智能招聘3D视觉工程师
  3. Linux Shell常用技巧(三) sed
  4. 自学python有用吗-文科专业自学python有用吗?
  5. vectornator安卓_Vectornator Pro
  6. ajax数据字符串拼接,ajax请求到后台数据,前台不用拼接字符串,一样显示到页面...
  7. 两个分布的特征映射_跨语言分布表示学习方法概述
  8. 【白皮书分享】2022新职业教育洞察白皮书:“职”成机遇,“育”见未来.pdf...
  9. libpcap 中的 struct block
  10. 计算机导论以python为舟_计算机科学导论
  11. Packer Terraform 让 ESS 更灵活
  12. 计算机信息专业致谢词,计算机专业毕业论文致谢词
  13. swustoj 1132 Coin-collecting by robot
  14. 深入理解计算机系统——地址翻译
  15. 莫队算法+带修莫队+回滚莫队
  16. 描述性物理海洋学--第一章学习笔记
  17. bp是什么意思贷款利率,利率上bp是什么意思
  18. 毫米和像素怎么换算_图片的像素和毫米之间是怎么换算的有公式吗
  19. 十月,再见;你好,十一月
  20. html网站手机最小字体大小,html在手机浏览器如何设置字体大小

热门文章

  1. 关于C#对AutoCad2007二次开发插件工作总结
  2. XP完美模拟Vista全攻略
  3. 通过QQ查看对方地址
  4. Microsoft Office 2010安装
  5. (15)tensorflow全连接层的张量实现
  6. 红米2屏幕显示android,水滴全面屏老人机:红米 8A
  7. 凯图 CADTool2000(天喻CAD) 1CD钣金展开软件AP100 v5.1 中文版 1DVD
  8. 人工智能:遗传算法稀布阵列天线
  9. 伺服驱动器和电机控制
  10. Mac虚拟机免费版下载 马上搞定