Multi-block parallel reduction for commutative operator
Compared with the single-block approach, the multi-block approach to parallel reduction in CUDA poses an additional challenge, because blocks are limited in how they can communicate with each other. The idea is to let each block compute a part of the input array, and then have one final block merge all the partial results. To do that, one can launch two kernels, which implicitly creates a grid-wide synchronization point between them.
static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24; //this number is hardware-dependent; usually #SM*2 is a good number.
__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    const int gridSize = blockSize*gridDim.x; //total number of threads in the grid; shadows the file-scope gridSize
    int sum = 0;
    for (int i = gthIdx; i < arraySize; i += gridSize) //grid-stride loop: each thread sums its share of the array
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size>0; size/=2) { //uniform tree reduction in shared memory
        if (thIdx<size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0];
}
__host__ int sumArray(int* arr) {
    int* dev_arr;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);
    int out;
    int* dev_out;
    cudaMalloc((void**)&dev_out, sizeof(int)*gridSize);
    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out);
    //dev_out now holds the partial results, one per block
    sumCommMultiBlock<<<1, blockSize>>>(dev_out, gridSize, dev_out);
    //dev_out[0] now holds the final result; the second launch starts only after the first finishes, because both run on the same stream
    cudaDeviceSynchronize();
    cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_arr);
    cudaFree(dev_out);
    return out;
}
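For reference, here is a minimal sketch of how sumArray could be driven from host code. The main function and the all-ones initialization below are only illustrative assumptions, chosen so the expected result is obvious:
#include <vector>
#include <cstdio>

int main() {
    std::vector<int> arr(wholeArraySize, 1); //every element is 1, so the expected sum is wholeArraySize
    int result = sumArray(arr.data());
    printf("sum = %d (expected %d)\n", result, wholeArraySize);
    return 0;
}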
Ideally, you want to launch just enough blocks at full occupancy to saturate all the multiprocessors on the GPU. Exceeding that number, in particular launching as many threads as there are elements in the array, is counterproductive: it no longer adds raw computing power, but it does prevent the use of the very efficient first loop.
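If hard-coding gridSize is undesirable, the block count can instead be derived from the device at run time. The sketch below queries the number of multiprocessors and applies the #SM*2 heuristic mentioned in the code comment above; the helper name pickGridSize, the use of device 0, and the factor of 2 are assumptions, not tuned values:
#include <cuda_runtime.h>

int pickGridSize() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); //device 0 assumed
    return 2 * prop.multiProcessorCount; //roughly two resident blocks per SM
}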
With the help of a last-block guard, the same result can be obtained with a single kernel:
static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24;
__device__ bool lastBlock(int* counter) {
    __threadfence(); //ensure that partial result is visible by all blocks
    int last = 0;
    if (threadIdx.x == 0)
        last = atomicAdd(counter, 1);
    return __syncthreads_or(last == gridDim.x-1); //true for every thread of the block that incremented the counter last
}
__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut, int* lastBlockCounter) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    const int gridSize = blockSize*gridDim.x; //total number of threads in the grid; shadows the file-scope gridSize
    int sum = 0;
    for (int i = gthIdx; i < arraySize; i += gridSize)
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size>0; size/=2) { //uniform
        if (thIdx<size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0];
    if (lastBlock(lastBlockCounter)) { //the last block to finish merges the per-block partial results
        shArr[thIdx] = thIdx<gridDim.x ? gOut[thIdx] : 0; //gOut holds one partial result per block; pad the rest with zeros
        __syncthreads();
        for (int size = blockSize/2; size>0; size/=2) { //uniform
            if (thIdx<size)
                shArr[thIdx] += shArr[thIdx+size];
            __syncthreads();
        }
        if (thIdx == 0)
            gOut[0] = shArr[0];
    }
}
__host__ int sumArray(int* arr) {
    int* dev_arr;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);
    int out;
    int* dev_out;
    cudaMalloc((void**)&dev_out, sizeof(int)*gridSize);
    int* dev_lastBlockCounter;
    cudaMalloc((void**)&dev_lastBlockCounter, sizeof(int));
    cudaMemset(dev_lastBlockCounter, 0, sizeof(int));
    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out, dev_lastBlockCounter);
    cudaDeviceSynchronize();
    cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_arr);
    cudaFree(dev_out);
    cudaFree(dev_lastBlockCounter);
    return out;
}
Note that the kernel can be made faster, for example by using a warp-level parallel reduction.
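As a sketch of what such a warp-level step could look like, the helper below uses __shfl_down_sync (available since CUDA 9 on compute capability 3.0+); the name warpReduceSum is illustrative, and a full warp is assumed to participate. Each warp reduces its 32 values through registers, without shared memory or __syncthreads, leaving the warp's total in lane 0:
__device__ int warpReduceSum(int val) {
    for (int offset = warpSize/2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset); //add the value held by the lane `offset` lanes above
    return val;
}
Such a helper could replace the last few iterations of the shared-memory loop, so that only one partial sum per warp remains to be combined.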