我正在使用这代码进行MJPEG解码,我试图使两个功能(IQZZ和IDCT)在GPU (NVIDIA k20c)上运行得更快。我正在使用OpenCL框架来完成这项任务。
我已经成功地将这些函数卸载到GPU,并且正在获得预期的输出。但是,在将代码卸载到GPU之后,输出视频非常慢。
我的.cl文件如下:
/******************************* IDCT *************************************/
void idct_1D(__local int *Y);
__kernel void IDCT(__global int* input, __global uchar* output)
{
    /*
     * Separable 8x8 inverse DCT: pass 1 transforms the rows, pass 2 the
     * columns.  Expected launch: a 2-D NDRange covering one 8x8 block
     * (dimension 0 = row index, dimension 1 = column index).
     *
     * NOTE(review): get_global_id() only equals the in-block row/column
     * while a single 8x8 work-group is launched per block, as the host
     * code does today — switch to get_local_id() if several blocks are
     * ever batched into one NDRange.
     */
    __local int Y[64];      /* 8x8 coefficient tile shared by the work-group */
    __local int Yc[8][8];   /* one scratch row per column worker; the original
                             * shared a single Yc[8] between all work-items,
                             * which is a data race */
    int k = get_global_id(0);
    int lid = get_global_id(1);
    int l;

    /* Pass 1: 1-D IDCT of row k. */
    if (k < 8)
    {
        for (l = 0; l < 8; l++)
        {
            Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        }
        idct_1D(&Y(k, 0));
    }

    /* Every row must be finished before any column is read; without this
     * barrier pass 2 can observe stale or partially written values in Y
     * (the original kernel omitted it). */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Pass 2: 1-D IDCT of column lid, then level-shift by 128, clamp to
     * [0, 255] and store the pixel. */
    if (lid < 8)
    {
        for (k = 0; k < 8; k++)
        {
            Yc[lid][k] = Y(k, lid);
        }
        idct_1D(Yc[lid]);
        for (k = 0; k < 8; k++)
        {
            int r = 128 + DESCALE(Yc[lid][k], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(k, lid) = r;
        }
    }
}
/*
 * In-place 8-point 1-D inverse DCT over one row (or column) held in
 * __local memory.  The transform is computed through three scratch
 * stages (z1 -> z2 -> z3) and a final butterfly stage that writes the
 * eight results back into Y[0..7].
 *
 * NOTE(review): `but`, `rot`, `CMUL` and `sqrt2` come from the constants
 * section omitted from this post — presumably the fixed-point
 * butterfly/rotation macros of the reference C decoder; verify against
 * the original .c file.
 */
void idct_1D(__local int *Y)
{
    int z1[8], z2[8], z3[8];
    /* Stage 1: even part (Y0/Y4 butterfly, Y2/Y6 rotation) and odd part
     * (Y1/Y7 butterfly, sqrt2-scaled Y3 and Y5). */
    but(Y[0], Y[4], z1[1], z1[0]);
    rot(1, 6, Y[2], Y[6], &z1[2], &z1[3]);
    but(Y[1], Y[7], z1[4], z1[7]);
    z1[5] = CMUL(sqrt2, Y[3]);
    z1[6] = CMUL(sqrt2, Y[5]);
    /* Stage 2: cross butterflies combining the two halves. */
    but(z1[0], z1[3], z2[3], z2[0]);
    but(z1[1], z1[2], z2[2], z2[1]);
    but(z1[4], z1[6], z2[6], z2[4]);
    but(z1[7], z1[5], z2[5], z2[7]);
    /* Stage 3: even half passes through unchanged; odd half gets two
     * more rotations. */
    z3[0] = z2[0];
    z3[1] = z2[1];
    z3[2] = z2[2];
    z3[3] = z2[3];
    rot(0, 3, z2[4], z2[7], &z3[4], &z3[7]);
    rot(0, 1, z2[5], z2[6], &z3[5], &z3[6]);
    /* Final butterflies write the output samples back in place. */
    but(z3[0], z3[7], Y[7], Y[0]);
    but(z3[1], z3[6], Y[6], Y[1]);
    but(z3[2], z3[5], Y[5], Y[2]);
    but(z3[3], z3[4], Y[4], Y[3]);
}
/*---------------IQZZ----------------------------*/
/*
 * Inverse quantization + inverse zig-zag reorder for one 8x8 block.
 * One work-item per coefficient: dequantize in[index] with the
 * quantization table and scatter it to its natural-order position via
 * the zig-zag map G_ZZ (defined in the omitted constants section).
 */
__kernel void iqzz_block(__global int in[64], __global int out[64],
__global uchar table[64])
{
    uint index = get_global_id(0);
    if (index < 64)
    {
        /* The original staged the data through three 64-element private
         * arrays (priv_in / priv_table / priv_out) although each
         * work-item only ever touched one element of each — 192 ints of
         * wasted registers/spill per work-item.  A direct
         * read–multiply–scatter is observably identical. */
        out[G_ZZ[index]] = in[index] * table[index];
    }
}对于IDCT,我只需从.c文件中复制和粘贴常量。对于简洁性,我还没有在查询中包含常量。有关常量的详细信息可以找到这里。
在main.c中,我简单地用OpenCL命令替换了函数调用,以便将数据传输到设备,在那里执行内核并在CPU上传输结果。
我的main.c看起来是这样的:
/* Get Platform */
ret= clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
/* Get Device */
ret= clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
/* Create Context */
context = clCreateContext(0, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create kernel from source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret= clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* NOTE(review): `ret` is overwritten on almost every call and never
 * checked — at minimum check clBuildProgram and fetch the build log on
 * failure. */
//--------kernel for iqzz-----------//
kernel= clCreateKernel(program, "iqzz_block", &ret);
//-------kernel for idct-----------//
cos_kernel= clCreateKernel(program, "IDCT", &ret);
cl_mem block_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * sizeof(cl_int), NULL, &ret);
//This will serve as the output buffer for iqzz
cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);
chk(ret, "clCreateBuffer");
//Output buffer
/* NOTE(review): DCT_Output is created once here using
 * `component_index` and `chroma_ss`, which only take their real values
 * inside the loops below — presumably a paste artifact; confirm where
 * this buffer is actually allocated in the real main.c. */
cl_mem DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
//Regular code from main.c follows............
case M_SOS:
//regular code from main.c.......
//The Relevant part starts here......
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
for (index = 0; index < SOS_section.n; index++)
{
int component_index = component_order[index];
int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf)*(SOF_component[component_index].HV & 0x0f);
for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++)
{
unpack_block(movie, &scan_desc,index, MCU);
/////--------------Transfer data to buffers----------------////////////
/* NOTE(review): performance killer — two blocking (CL_TRUE) 64-element
 * writes, two kernel launches of at most 64 work-items, and two
 * blocking reads PER 8x8 BLOCK.  PCIe latency and launch overhead
 * dominate; batch many MCUs per transfer/launch and use non-blocking
 * enqueues to make the GPU worthwhile. */
ret = clEnqueueWriteBuffer(command_queue, block_GPU, CL_TRUE, 0, 64 * sizeof(cl_int), MCU, 0, NULL, NULL);
/* NOTE(review): BUG — qtable_GPU is written here but only created two
 * lines below; this write targets an invalid/uninitialized cl_mem.
 * Move the clCreateBuffer above the write — and out of the loop
 * entirely, since creating a buffer per iteration without
 * clReleaseMemObject also leaks one buffer per block. */
ret = clEnqueueWriteBuffer(command_queue, qtable_GPU, CL_TRUE, 0, 64 * sizeof(cl_uchar), DQT_table[SOF_component[component_index].q_table], 0, NULL, NULL);
cl_mem qtable_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * sizeof(cl_uchar), NULL, &ret);
/* Set OpenCL kernel arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&block_GPU);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&DCT_Input);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&qtable_GPU);
start_time = wtime();
size_t global=64;
size_t local= 16;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
run_time += wtime() - start_time;
//Copy result from device to host
/* NOTE(review): this read-back is unnecessary — DCT_Input stays on the
 * device and is fed straight into the IDCT kernel below; dropping it
 * removes one blocking round-trip per block. */
ret = clEnqueueReadBuffer(command_queue, DCT_Input, CL_TRUE, 0, 64 * sizeof(cl_int), &unZZ_MCU, 0, NULL, NULL);
/////---------------IDCT-----------------//////
ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
//No. of work-items
const size_t globalForInverseDCT[2]= {8, 8};
/* NOTE(review): BUG — work_dim is 1 but globalForInverseDCT has two
 * elements and the IDCT kernel uses get_global_id(1); pass 2 here.
 * Also `&globalForInverseDCT` is a pointer-to-array (pass the array
 * itself), and `localForInverseDCT` is not declared in this excerpt. */
ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 1, NULL, &globalForInverseDCT, &localForInverseDCT, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
}
upsampler(YCbCr_MCU_ds[component_index],YCbCr_MCU[component_index],Horizontal,Vertical,max_ss_h,max_ss_v);
}
//more code which is not immediately relevant follows......
}如何修改我的iqzz和idct内核,使它们在GPU上运行得更快?
我的GPU的详细信息如下:
DEVICE_NAME = Tesla K20c
DEVICE_VENDOR = NVIDIA Corporation
DEVICE_VERSION = OpenCL 1.2 CUDA
DRIVER_VERSION = 352.21
DEVICE_MAX_COMPUTE_UNITS = 13
DEVICE_MAX_CLOCK_FREQUENCY = 705
DEVICE_GLOBAL_MEM_SIZE = 5032706048
CL_DEVICE_ERROR_CORRECTION_SUPPORT: yes
CL_DEVICE_LOCAL_MEM_TYPE: local
CL_DEVICE_LOCAL_MEM_SIZE: 48 KByte
CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: 64 KByte
CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE

发布于 2017-11-01 23:49:13
先说代码风格：`ret` 的返回值您每次都在无谓地赋值后丢弃，从未检查；此外应避免使用单字母变量名（Y、k、l）和过于泛化的名称（index）。至于性能：卸载到GPU的主要问题是数据传输非常慢。您需要最大限度地减少CPU与GPU之间的数据拷贝次数，并最大限度地利用GPU的并行性。如果做不到这一点，那么把计算留在CPU上几乎总是更快（特别是使用SIMD时）——即使单个操作较慢，整体处理数据的速度仍然更高。
https://codereview.stackexchange.com/questions/174433
复制相似问题