交换算子的多块并行约简
与单块方法相比,CUDA 中并行约简的多块方法带来了额外的挑战,因为块在通信方面受到限制。我们的想法是让每个块计算输入数组的一部分,然后有一个最终块来合并所有部分结果。要做到这一点,可以启动两个内核,两次启动之间隐式构成一个网格范围的同步点。
static const int wholeArraySize = 100000000; // total number of ints to reduce
static const int blockSize = 1024; // threads per block; must be a power of two for the halving reduction below
static const int gridSize = 24; // number of blocks; hardware-dependent — usually #SM*2 is a good number.
// Partial sum reduction: each block reduces a grid-strided slice of
// gArr[0..arraySize) and writes its partial sum to gOut[blockIdx.x].
// Expects a 1-D launch with blockDim.x == blockSize (power of two);
// gOut must have room for gridDim.x ints.
__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    // Total threads in the grid. Renamed from "gridSize" — the old name
    // shadowed the file-scope constant gridSize (number of blocks) with a
    // value that means something entirely different.
    const int gridStride = blockSize*gridDim.x;
    int sum = 0;
    // Grid-stride loop: correct for any arraySize, keeps loads coalesced.
    for (int i = gthIdx; i < arraySize; i += gridStride)
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    // Tree reduction in shared memory; every halving step is followed by a
    // barrier reached by all threads (the branch only guards the add).
    for (int size = blockSize/2; size > 0; size /= 2) { //uniform
        if (thIdx < size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0];
}
// Sums wholeArraySize ints from host array `arr` on the GPU using the
// two-kernel scheme: pass 1 produces gridSize partial sums, pass 2 reduces
// them with a single block. Returns the total, or 0 if a CUDA call failed
// (previously `out` was returned uninitialized on failure — UB).
__host__ int sumArray(int* arr) {
    int out = 0; // well-defined result even if a CUDA call fails below
    int* dev_arr = 0;
    int* dev_out = 0;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**)&dev_out, sizeof(int)*gridSize);
    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out);
    // dev_out now holds the partial results; launching a second kernel on the
    // same stream acts as an implicit grid-wide synchronization point.
    sumCommMultiBlock<<<1, blockSize>>>(dev_out, gridSize, dev_out);
    // Kernel launches are asynchronous and report nothing directly:
    // surface launch-config errors, then execution errors, before reading.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err == cudaSuccess)
        cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    // cudaFree(0) is a harmless no-op, so unconditional cleanup is safe.
    cudaFree(dev_arr);
    cudaFree(dev_out);
    return out;
}
理想情况下,想要在完全占用时启动足够的块以使 GPU 上的所有多处理器饱和。超过这个数字 - 特别是,启动与数组中的元素一样多的线程 - 会适得其反。这样做不会再增加原始计算能力,但会阻止使用非常有效的第一个循环。
在 last-block guard(最后块守卫)的帮助下,使用单个内核也可以获得相同的结果:
static const int wholeArraySize = 100000000; // total number of ints to reduce
static const int blockSize = 1024; // threads per block; power of two required by the halving reduction
static const int gridSize = 24; // number of blocks; must not exceed blockSize so one block can gather all partials
// Returns true in every thread of exactly one block: the block whose thread 0
// performed the final increment of *counter. Each block calls this once after
// publishing its partial result; *counter must be zeroed before the launch.
__device__ bool lastBlock(int* counter) {
    __threadfence(); //ensure that partial result is visible by all blocks
    int last = 0;
    if (threadIdx.x == 0)
        last = atomicAdd(counter, 1); // returns the OLD value; gridDim.x-1 means we incremented last
    // __syncthreads_or broadcasts thread 0's verdict to the whole block
    // (and doubles as the required block-wide barrier).
    return __syncthreads_or(last == gridDim.x-1);
}
// Single-kernel sum reduction over gArr[0..arraySize). Every block writes a
// partial sum to gOut[blockIdx.x]; the last block to finish (detected via
// lastBlockCounter, which must be zeroed before launch) then reduces the
// gridDim.x partials and stores the grand total in gOut[0].
// Preconditions: blockDim.x == blockSize (power of two), gridDim.x <= blockSize.
__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut, int* lastBlockCounter) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    // Total threads in the grid. Renamed from "gridSize", which shadowed the
    // file-scope constant of the same name (number of blocks) — that
    // shadowing caused the out-of-bounds bug fixed below.
    const int gridStride = blockSize*gridDim.x;
    int sum = 0;
    for (int i = gthIdx; i < arraySize; i += gridStride)
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size > 0; size /= 2) { //uniform
        if (thIdx < size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0];
    if (lastBlock(lastBlockCounter)) {
        // BUG FIX: the guard used to be `thIdx < gridSize`, which bound to
        // the local gridSize (blockSize*gridDim.x), so all blockSize threads
        // read gOut[thIdx] — an out-of-bounds read past the gridDim.x
        // elements actually allocated. Only gridDim.x partials exist.
        shArr[thIdx] = thIdx < gridDim.x ? gOut[thIdx] : 0;
        __syncthreads();
        for (int size = blockSize/2; size > 0; size /= 2) { //uniform
            if (thIdx < size)
                shArr[thIdx] += shArr[thIdx+size];
            __syncthreads();
        }
        if (thIdx == 0)
            gOut[0] = shArr[0];
    }
}
// Sums wholeArraySize ints from host array `arr` on the GPU using the
// single-kernel last-block-guard scheme. Returns the total, or 0 if a CUDA
// call failed (previously `out` was returned uninitialized on failure — UB).
__host__ int sumArray(int* arr) {
    int out = 0; // well-defined result even if a CUDA call fails below
    int* dev_arr = 0;
    int* dev_out = 0;
    int* dev_lastBlockCounter = 0;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**)&dev_out, sizeof(int)*gridSize);
    cudaMalloc((void**)&dev_lastBlockCounter, sizeof(int));
    // The last-block protocol requires the counter to start at zero.
    cudaMemset(dev_lastBlockCounter, 0, sizeof(int));
    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out, dev_lastBlockCounter);
    // Kernel launches are asynchronous and report nothing directly:
    // surface launch-config errors, then execution errors, before reading.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err == cudaSuccess)
        cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    // cudaFree(0) is a harmless no-op, so unconditional cleanup is safe.
    cudaFree(dev_arr);
    cudaFree(dev_out);
    cudaFree(dev_lastBlockCounter);
    return out;
}
请注意,内核可以更快,例如通过使用 warp 级并行缩减。