首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >MJPEG中IQZZ和IDCT的OpenCL实现

MJPEG中IQZZ和IDCT的OpenCL实现
EN

Code Review用户
提问于 2017-08-30 21:14:44
回答 1查看 1.2K关注 0票数 13

我正在使用代码进行MJPEG解码,我试图使两个功能(IQZZ和IDCT)在GPU (NVIDIA k20c)上运行得更快。我正在使用OpenCL框架来完成这项任务。

我已经成功地将这些函数卸载到GPU,并且正在获得预期的输出。但是,在将代码卸载到GPU之后,输出视频非常慢。

我的.cl文件如下:

代码语言:javascript
复制
/******************************* IDCT *************************************/

void idct_1D(__local int *Y);

/*
 * IDCT kernel: inverse-transforms one 8x8 block of coefficients.
 *
 * input  - 64 coefficients, read as input[(row << 3) + col].
 * output - destination written through the X(row, col) macro (defined
 *          elsewhere) after the result is offset by 128 and clamped to 0..255.
 *
 * Work-item mapping: get_global_id(0) selects the row handled in the first
 * pass and get_global_id(1) selects the column handled in the second pass.
 * Both passes communicate through __local memory, so every work-item that
 * takes part must belong to the same work-group.
 *
 * The Y(), X(), SCALE() and DESCALE() macros and S_BITS are defined elsewhere.
 */
__kernel void IDCT(__global int* input, __global uchar* output) 
{
    /* Block after the row pass; shared by the whole work-group. */
    __local int Y[64];
    /* One 8-entry scratch line per column, so that work-items handling
     * different columns never write to the same scratch storage. */
    __local int Yc[8][8];

    int k = get_global_id(0);
    int l;
    int lid = get_global_id(1);

    /* Pass 1: one row per work-item. Only the work-item with local id 0 in
     * dimension 1 does the work, so a row is never transformed in place by
     * several work-items at once. */
    if (k < 8 && get_local_id(1) == 0)
    {
        for (l = 0; l < 8; l++)
        {
            Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        }
        idct_1D(&Y(k, 0));
    }

    /* The column pass reads rows produced by other work-items. Local-memory
     * writes are only guaranteed visible across the work-group after a
     * barrier, and the barrier must be reached by every work-item, hence it
     * sits outside both conditionals. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Pass 2: one column per work-item, again de-duplicated so that a single
     * work-item owns each column's scratch line. */
    if (lid < 8 && get_local_id(0) == 0)
    {
        __local int *column = Yc[lid];
        int row;

        for (row = 0; row < 8; row++)
        {
            column[row] = Y(row, lid);
        }

        idct_1D(column);

        for (row = 0; row < 8; row++)
        {
            int r = 128 + DESCALE(column[row], S_BITS + 3);
            /* Clamp to the range an unsigned 8-bit sample can hold. */
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(row, lid) = r;
        }
    }
}

/*
 * One-dimensional 8-point pass used by the IDCT kernel.
 *
 * Y - eight consecutive ints in __local memory; all eight are read in the
 *     first stage and overwritten with the results in the last stage.
 *
 * The but(), rot() and CMUL() helpers and the sqrt2 constant are defined
 * elsewhere. As used here, but() receives two inputs followed by two
 * destinations, and rot() receives two constants, two inputs and two
 * destination pointers.
 */
void idct_1D(__local int *Y) 
{
    int stage_a[8];
    int stage_b[8];
    int stage_c[8];
    int i;

    /* Stage 1: every element of Y is consumed here. */
    but(Y[0], Y[4], stage_a[1], stage_a[0]);
    rot(1, 6, Y[2], Y[6], &stage_a[2], &stage_a[3]);
    but(Y[1], Y[7], stage_a[4], stage_a[7]);
    stage_a[5] = CMUL(sqrt2, Y[3]);
    stage_a[6] = CMUL(sqrt2, Y[5]);

    /* Stage 2: pairwise combinations of the stage-1 values. */
    but(stage_a[0], stage_a[3], stage_b[3], stage_b[0]);
    but(stage_a[1], stage_a[2], stage_b[2], stage_b[1]);
    but(stage_a[4], stage_a[6], stage_b[6], stage_b[4]);
    but(stage_a[7], stage_a[5], stage_b[5], stage_b[7]);

    /* Stage 3: the first four values pass through unchanged, the last four
     * go through rot(). */
    for (i = 0; i < 4; i++)
    {
        stage_c[i] = stage_b[i];
    }
    rot(0, 3, stage_b[4], stage_b[7], &stage_c[4], &stage_c[7]);
    rot(0, 1, stage_b[5], stage_b[6], &stage_c[5], &stage_c[6]);

    /* Stage 4: write the results back over the input line. */
    but(stage_c[0], stage_c[7], Y[7], Y[0]);
    but(stage_c[1], stage_c[6], Y[6], Y[1]);
    but(stage_c[2], stage_c[5], Y[5], Y[2]);
    but(stage_c[3], stage_c[4], Y[4], Y[3]);
} 

/*---------------IQZZ----------------------------*/

/*
 * iqzz_block kernel: multiplies each input coefficient by the matching
 * table entry and stores the product at the position given by G_ZZ.
 *
 * in    - 64 input coefficients.
 * out   - 64 output coefficients; out[G_ZZ[i]] receives in[i] * table[i].
 * table - 64 multipliers.
 *
 * One work-item handles one coefficient; work-items whose global id is 64 or
 * more do nothing. G_ZZ is a lookup table defined elsewhere.
 *
 * Each work-item touches exactly one element, so the values are used straight
 * from global memory instead of being staged in per-work-item 64-element
 * private arrays of which only a single slot was ever used.
 */
__kernel void iqzz_block(__global int in[64], __global int out[64],
        __global uchar table[64])
{
    uint index = get_global_id(0);

    if (index < 64)
    {
        out[G_ZZ[index]] = in[index] * table[index];
    }
}

对于IDCT,我只是从.c文件中复制并粘贴了相关常量。为简洁起见,我没有在问题中列出这些常量;有关常量的详细信息可以在这里找到。

在main.c中,我只是用OpenCL命令替换了原来的函数调用:先将数据传输到设备,在设备上执行内核,然后再把结果传回CPU。

我的main.c看起来是这样的:

代码语言:javascript
复制
/* Get Platform */
ret= clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

/* Get Device */
ret= clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

 /* Create Context */
context = clCreateContext(0, 1, &device_id, NULL, NULL, &ret);

/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

/* Create kernel from source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);


ret= clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

//--------kernel for iqzz-----------//
kernel= clCreateKernel(program, "iqzz_block", &ret);

//-------kernel for idct-----------//
cos_kernel= clCreateKernel(program, "IDCT", &ret);

cl_mem block_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * sizeof(cl_int), NULL, &ret);


//This will serve as the output buffer for iqzz
cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);
chk(ret, "clCreateBuffer");

//Output buffer
cl_mem  DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);

//Regular code from main.c follows............

case M_SOS:

//regular code from main.c.......

//The Relevant part starts here......

for (index_X = 0; index_X < nb_MCU_X; index_X++) {

for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {

for (index = 0; index < SOS_section.n; index++)
{

int component_index = component_order[index];

int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf)*(SOF_component[component_index].HV & 0x0f);

/*
 * Per-block pipeline: unpack one block on the host, run iqzz_block and then
 * IDCT on the device, and read the resulting samples back.
 */
for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++)
{
    unpack_block(movie, &scan_desc, index, MCU);

    /* The table buffer has to exist before anything is written to it.
     * NOTE(review): it is created and released once per block here; hoisting
     * it out of the loop nest, next to the other buffers, would avoid the
     * repeated allocation. */
    cl_mem qtable_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * sizeof(cl_uchar), NULL, &ret);
    chk(ret, "clCreateBuffer(qtable_GPU)");

    /////--------------Transfer data to buffers----------------////////////

    ret = clEnqueueWriteBuffer(command_queue, block_GPU, CL_TRUE, 0, 64 * sizeof(cl_int), MCU, 0, NULL, NULL);
    chk(ret, "clEnqueueWriteBuffer(block_GPU)");

    ret = clEnqueueWriteBuffer(command_queue, qtable_GPU, CL_TRUE, 0, 64 * sizeof(cl_uchar), DQT_table[SOF_component[component_index].q_table], 0, NULL, NULL);
    chk(ret, "clEnqueueWriteBuffer(qtable_GPU)");

    /* Set OpenCL kernel arguments */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&block_GPU);
    chk(ret, "clSetKernelArg(iqzz_block, 0)");
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&DCT_Input);
    chk(ret, "clSetKernelArg(iqzz_block, 1)");
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&qtable_GPU);
    chk(ret, "clSetKernelArg(iqzz_block, 2)");

    /* clEnqueueNDRangeKernel only queues the kernel, so the interval measured
     * below covers the enqueue call itself, not the kernel's execution. */
    start_time = wtime();

    size_t global = 64;
    size_t local = 16;

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    chk(ret, "clEnqueueNDRangeKernel(iqzz_block)");

    run_time += wtime() - start_time;

    //Copy result from device to host
    ret = clEnqueueReadBuffer(command_queue, DCT_Input, CL_TRUE, 0, 64 * sizeof(cl_int), unZZ_MCU, 0, NULL, NULL);
    chk(ret, "clEnqueueReadBuffer(DCT_Input)");

    /////---------------IDCT-----------------//////

    ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
    chk(ret, "clSetKernelArg(IDCT, 0)");
    ret = clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
    chk(ret, "clSetKernelArg(IDCT, 1)");

    /* The IDCT kernel indexes rows with get_global_id(0) and columns with
     * get_global_id(1), so it needs a two-dimensional range. A single 8x8
     * work-group keeps the whole block in one group, which the kernel's
     * __local arrays require. */
    const size_t globalForInverseDCT[2] = {8, 8};
    const size_t localForInverseDCT[2] = {8, 8};

    ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 2, NULL, globalForInverseDCT, localForInverseDCT, 0, NULL, NULL);
    chk(ret, "clEnqueueNDRangeKernel(IDCT)");

    ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
    chk(ret, "clEnqueueReadBuffer(DCT_Output)");

    /* Both reads above are blocking, so no queued command still refers to
     * the table buffer at this point. */
    ret = clReleaseMemObject(qtable_GPU);
    chk(ret, "clReleaseMemObject(qtable_GPU)");
}
upsampler(YCbCr_MCU_ds[component_index],YCbCr_MCU[component_index],Horizontal,Vertical,max_ss_h,max_ss_v);
}

//more code which is not immediately relevant follows......

}

如何修改我的iqzz和idct内核,使它们在GPU上运行得更快?

我的GPU的详细信息如下:

代码语言:javascript
复制
DEVICE_NAME = Tesla K20c
DEVICE_VENDOR = NVIDIA Corporation
DEVICE_VERSION = OpenCL 1.2 CUDA
DRIVER_VERSION = 352.21
DEVICE_MAX_COMPUTE_UNITS = 13
DEVICE_MAX_CLOCK_FREQUENCY = 705
DEVICE_GLOBAL_MEM_SIZE = 5032706048
CL_DEVICE_ERROR_CORRECTION_SUPPORT: yes
CL_DEVICE_LOCAL_MEM_TYPE:       local
CL_DEVICE_LOCAL_MEM_SIZE:       48 KByte
CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: 64 KByte
CL_DEVICE_QUEUE_PROPERTIES:     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
CL_DEVICE_QUEUE_PROPERTIES:     CL_QUEUE_PROFILING_ENABLE
EN

回答 1

Code Review用户

回答已采纳

发布于 2017-11-01 23:49:13

  1. 请对循环体进行缩进。
  2. 务必真正检查ret——您每次都只是给它赋值,然后就把结果丢弃了。
  3. 使用更好的变量名称:避免使用单个字母(Y、k、l)和过于笼统的名称(index)
  4. 代码的所有工作似乎都在四个嵌套循环中:
    1. 尝试向量化-重写内部块,同时对多个像素/组件/色度进行操作。
    2. 优化缓存和分支预测的迭代顺序。
    3. 提取任何实际上不需要在内部循环中的内容(比如运行时计算?)

卸载到GPU的主要问题是数据传输非常慢。您需要最大限度地减少CPU与GPU之间的数据拷贝次数,并最大限度地利用GPU的并行性。如果做不到这一点,那么留在CPU上计算总是会更快(特别是使用SIMD时);即使单个操作比较慢,它处理完全部数据的总体速度仍然可能更快。

票数 14
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/174433

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档