你好,我是JOCL(OpenCL)的新手。我编写这段代码是为了计算每幅图像的强度之和。内核接收一个一维数组,其中依次存放了所有图像的全部像素。一幅图像是300x300,所以每幅图像有90000个像素。现在它比我串行执行同样的计算还要慢。
我的代码
package PAR;
/*
* JOCL - Java bindings for OpenCL
*
* Copyright 2009 Marco Hutter - http://www.jocl.org/
*/
import IMAGE_IO.ImageReader;
import IMAGE_IO.Input_Folder;
import static org.jocl.CL.*;
import org.jocl.*;
/**
* A small JOCL sample.
*/
public class IPPARA {
/**
 * The source code of the OpenCL program to execute.
 *
 * The kernel {@code sampleKernel} runs one work-item per image: work-item
 * {@code i} adds up the 90000 consecutive elements of {@code a} starting at
 * index {@code i * 90000} and stores the total in {@code c[i]}. The sum is
 * accumulated in a private variable and written to {@code c} once at the end.
 *
 * The field is never reassigned, so it is declared {@code final}.
 */
private static final String programSource =
    "__kernel void "
    + "sampleKernel(__global uint *a,"
    + " __global uint *c)"
    + "{"
    + "__private uint intensity_core=0;"
    + " uint i = get_global_id(0);"
    + " for(uint j=i*90000; j < (i+1)*90000; j++){ "
    + " intensity_core += a[j];"
    + " }"
    + "c[i]=intensity_core;"
    + "}";
/**
 * The entry point of this sample.
 *
 * Reads all images through {@code ImageReader}, sums the elements of each
 * image on an OpenCL device using the kernel in {@code programSource}, and
 * prints the elapsed time in seconds. The timer starts after the images have
 * been read and therefore includes OpenCL setup and program compilation.
 *
 * @param args Not used
 */
public static void main(String args[]) {
    // Must match the per-image element count hard-coded in programSource.
    final int pixelsPerImage = 90000;

    ImageReader imagereader = new ImageReader();
    int srcArrayA[] = imagereader.readImages();
    int numImages = srcArrayA.length / pixelsPerImage;
    if (numImages == 0) {
        // OpenCL rejects zero-sized buffers and NDRanges, so stop early.
        System.out.println("No complete image to process");
        return;
    }

    long before = System.nanoTime();
    int dstArray[] = new int[numImages];
    Pointer srcA = Pointer.to(srcArrayA);
    Pointer dst = Pointer.to(dstArray);

    // Byte sizes are computed in long so large inputs cannot overflow int.
    long srcBytes = (long) Sizeof.cl_uint * srcArrayA.length;
    long dstBytes = (long) Sizeof.cl_uint * numImages;

    // Obtain the platform IDs and initialize the context properties
    System.out.println("Obtaining platform...");
    cl_platform_id platforms[] = new cl_platform_id[1];
    clGetPlatformIDs(platforms.length, platforms, null);
    cl_context_properties contextProperties = new cl_context_properties();
    contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

    // Create an OpenCL context on a GPU device
    cl_context context = clCreateContextFromType(
        contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
    if (context == null) {
        // If no context for a GPU device could be created,
        // try to create one for a CPU device.
        context = clCreateContextFromType(
            contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
        if (context == null) {
            System.out.println("Unable to create a context");
            return;
        }
    }

    // Enable exceptions and subsequently omit error checks in this sample.
    // This is done after context creation so the null checks above still work.
    CL.setExceptionsEnabled(true);

    cl_command_queue commandQueue = null;
    cl_mem memObjects[] = new cl_mem[2];
    cl_program program = null;
    cl_kernel kernel = null;
    try {
        // Get the list of devices associated with the context
        long numBytes[] = new long[1];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

        // Obtain the cl_device_id for the first device
        int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
            Pointer.to(devices), null);

        // Create a command-queue
        commandQueue = clCreateCommandQueue(context, devices[0], 0, null);

        // Allocate the memory objects for the input- and output data
        memObjects[0] = clCreateBuffer(context,
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            srcBytes, srcA, null);
        memObjects[1] = clCreateBuffer(context,
            CL_MEM_READ_WRITE,
            dstBytes, null, null);

        // Create the program from the source code
        program = clCreateProgramWithSource(context,
            1, new String[]{programSource}, null, null);

        // Build the program
        clBuildProgram(program, 0, null, null, null, null);

        // Create the kernel
        kernel = clCreateKernel(program, "sampleKernel", null);

        // Set the arguments for the kernel
        clSetKernelArg(kernel, 0,
            Sizeof.cl_mem, Pointer.to(memObjects[0]));
        clSetKernelArg(kernel, 1,
            Sizeof.cl_mem, Pointer.to(memObjects[1]));

        // One work-item per image. The local work size is left null so the
        // OpenCL implementation picks the work-group size; the kernel only
        // uses get_global_id(0), so the grouping does not affect the result.
        long global_work_size[] = new long[]{numImages};

        // Execute the kernel
        clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
            global_work_size, null, 0, null, null);

        // Read the output data (blocking read of numImages uint values)
        clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE, 0,
            dstBytes, dst, 0, null, null);
    } finally {
        // Release kernel, program, and memory objects even if a call above
        // threw, so no OpenCL resources are leaked.
        for (cl_mem memObject : memObjects) {
            if (memObject != null) {
                clReleaseMemObject(memObject);
            }
        }
        if (kernel != null) {
            clReleaseKernel(kernel);
        }
        if (program != null) {
            clReleaseProgram(program);
        }
        if (commandQueue != null) {
            clReleaseCommandQueue(commandQueue);
        }
        clReleaseContext(context);
    }

    long after = System.nanoTime();
    System.out.println("Time: " + (after - before) / 1e9);
}
}
根据答案中的建议,在CPU上运行的并行代码几乎与串行代码一样快。还有什么可以改进的地方吗?
发布于 2012-11-24 17:05:40
for(uint j=i*90000; j < (i+1)*90000; j++){ "
+ " c[i] += a[j];"
(1)您正在使用全局内存(c[])进行求和,这很慢。改用私有变量会更快。就像这样:
"__kernel void "
+ "sampleKernel(__global uint *a,"
+ " __global uint *c)"
+ "{"
+ "__private uint intensity_core=0;" <---this is a private variable of each core
+ " uint i = get_global_id(0);"
+ " for(uint j=i*90000; j < (i+1)*90000; j++){ "
+ " intensity_core += a[j];" <---register is at least 100x faster than global memory
//but we cannot get rid of a[] so the calculation time cannot be less than %50
+ " }"
+ "c[i]=intensity_core;"
+ "}"; //expecting %100 speedup现在,你有几个图像阵列的总和强度。
您的本地工作组大小是1,因此只要您至少有160幅图像(这是您的GPU的核心数量),计算就会用到所有核心。
您将需要90000*num_images次读取、num_images次写入,以及90000*num_images次寄存器读/写。使用寄存器将使内核时间减半。
(2)每2次内存访问,你只做1次数学运算。每次内存访问至少需要做10次数学运算,才能用上你的GPU峰值Gflops的一小部分(6490M的峰值为250 Gflops)。
您的i7 CPU可以轻松达到100 Gflops,但内存会成为瓶颈。当您通过PCI-Express发送全部数据时,情况会更糟。(HD Graphics 3000的标称性能为125 GFLOPS)
// Obtain a device ID
// Fills devices[] with up to numDevices devices of type deviceType from the
// given platform, then selects the entry at deviceIndex.
// NOTE(review): platform, deviceType, numDevices and deviceIndex are defined
// outside this snippet.
cl_device_id devices[] = new cl_device_id[numDevices];
clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
cl_device_id device = devices[deviceIndex];
// One of the devices[] elements must be your HD3000. Example: devices[0] -> gpu, devices[1] -> cpu,
// devices[2] -> HD3000
在你的程序中:
// Obtain the cl_device_id for the first device
int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
cl_device_id devices[] = new cl_device_id[numDevices];
clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
Pointer.to(devices), null);
接受第一个设备可能是gpu。
发布于 2012-11-26 16:34:03
您应该为每幅300x300的图像使用一整个工作组。这有助于让GPU核心满载,并让您能够使用本地内存。这样内核还能同时处理与设备计算单元数量一样多的图像。
下面的内核分三个步骤完成您的归约(reduction)。
定义WG_MAX_SIZE是因为我不喜欢传入可变大小的本地内存块。值为64,因为这是在大多数平台上使用的一个很好的值。如果要尝试使用更大的工作组,请确保将此值设置得更高。小于WG_MAX_SIZE的工作组仍然可以正常工作。
#define WORK_SIZE 90000
#define WG_MAX_SIZE 64
/* 1 = tree reduction (option #2), 0 = linear reduction by work-item 0 (option #1). */
#define USE_TREE_REDUCTION 1

/*
 * Sums WORK_SIZE consecutive elements of a per work-group.
 *
 * Work-group gid handles elements [gid*WORK_SIZE, (gid+1)*WORK_SIZE) of a and
 * writes their sum to c[gid].
 *
 * Three steps:
 *   1. each work-item accumulates a strided partial sum in a private variable;
 *   2. the partial sums are stored in local memory;
 *   3. the partial sums are reduced to one value, written by work-item 0.
 *
 * The work-group size must not exceed WG_MAX_SIZE, otherwise intensity_core
 * is indexed out of bounds.
 */
__kernel void sampleKernel(__global uint *a, __global uint *c)
{
    local uint intensity_core[WG_MAX_SIZE];
    private uint workItemIntensity = 0;
    int gid = get_group_id(0);
    int lid = get_local_id(0);
    int wgsize = get_local_size(0);
    int i;

    /* Step 1: start at this work-item's own offset (lid) and advance by the
     * work-group size, so each element is read by exactly one work-item. */
    for (i = gid * WORK_SIZE + lid; i < (gid + 1) * WORK_SIZE; i += wgsize) {
        workItemIntensity += a[i];
    }

    /* Step 2: publish the partial sum. A barrier (not just a mem_fence) is
     * needed so all work-items have written before any of them reads. */
    intensity_core[lid] = workItemIntensity;
    barrier(CLK_LOCAL_MEM_FENCE);

#if USE_TREE_REDUCTION
    /* option #2: O(log n) tree reduction.
     * At each stride i, work-items whose lid is a multiple of 2*i add in the
     * partner at lid + i. The bounds check makes this valid for work-group
     * sizes that are not a power of two. The barrier is outside the if so
     * every work-item reaches it. */
    for (i = 1; i < wgsize; i <<= 1) {
        if (lid % (i << 1) == 0 && lid + i < wgsize) {
            intensity_core[lid] += intensity_core[lid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0) {
        c[gid] = intensity_core[0];
    }
#else
    /* option #1: O(n) loop; work-item 0 adds the other partial sums to its own. */
    if (lid == 0) {
        for (i = 1; i < wgsize; i++) {
            workItemIntensity += intensity_core[i];
        }
        c[gid] = workItemIntensity;
    }
#endif
}
https://stackoverflow.com/questions/13543248
复制相似问题