Multi-block parallel reduction for a noncommutative operator
The multi-block approach to parallel reduction is very similar to the single-block approach. The global input array must be split into pieces, each of which is reduced by a single block. When a partial result from each block is obtained, one final block reduces these partial results to obtain the final result.
- `sumNoncommSingleBlock` is explained in more detail in the single-block reduction example.
- `lastBlock` accepts only the last block reaching it. If you want to avoid this, you can split the kernel into two separate calls (see the sketch after the listing).
```cuda
static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24; //this number is hardware-dependent; usually #SM*2 is a good number.

__device__ bool lastBlock(int* counter) {
    __threadfence(); //ensure that partial result is visible by all blocks
    int last = 0;
    if (threadIdx.x == 0)
        last = atomicAdd(counter, 1);
    return __syncthreads_or(last == gridDim.x-1); //true only in the block that incremented the counter last
}

__device__ void sumNoncommSingleBlock(const int* gArr, int arraySize, int* out) {
    int thIdx = threadIdx.x;
    __shared__ int shArr[blockSize*2];
    __shared__ int offset;
    shArr[thIdx] = thIdx<arraySize ? gArr[thIdx] : 0;
    if (thIdx == 0)
        offset = blockSize;
    __syncthreads();
    while (offset < arraySize) { //uniform
        shArr[thIdx + blockSize] = thIdx+offset<arraySize ? gArr[thIdx+offset] : 0;
        __syncthreads();
        if (thIdx == 0)
            offset += blockSize;
        int sum = shArr[2*thIdx] + shArr[2*thIdx+1];
        __syncthreads();
        shArr[thIdx] = sum;
    }
    __syncthreads();
    for (int stride = 1; stride<blockSize; stride*=2) { //uniform
        int arrIdx = thIdx*stride*2;
        if (arrIdx+stride<blockSize)
            shArr[arrIdx] += shArr[arrIdx+stride];
        __syncthreads();
    }
    if (thIdx == 0)
        *out = shArr[0];
}

__global__ void sumNoncommMultiBlock(const int* gArr, int* out, int* lastBlockCounter) {
    //each block reduces one contiguous chunk of the input
    int arraySizePerBlock = wholeArraySize/gridSize;
    const int* gArrForBlock = gArr+blockIdx.x*arraySizePerBlock;
    int arraySize = arraySizePerBlock;
    if (blockIdx.x == gridSize-1) //the last block also covers the remainder
        arraySize = wholeArraySize - blockIdx.x*arraySizePerBlock;
    sumNoncommSingleBlock(gArrForBlock, arraySize, &out[blockIdx.x]);
    if (lastBlock(lastBlockCounter)) //the last block to finish reduces the partial sums
        sumNoncommSingleBlock(out, gridSize, out);
}
```
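
As noted above, the `lastBlock` trick can be avoided by splitting the kernel into two separate calls: a first launch reduces each chunk to a per-block partial sum, and a second single-block launch reduces the partial sums. Below is a minimal sketch of that variant; the kernel names `sumNoncommFirstPass` and `sumNoncommFinalPass` are hypothetical, not part of the original example:

```cuda
//Hypothetical two-kernel variant: no atomic counter and no lastBlock() needed,
//because kernel launches on the same stream execute in order, so all partial
//sums are written before the final pass starts.
__global__ void sumNoncommFirstPass(const int* gArr, int* out) {
    int arraySizePerBlock = wholeArraySize/gridSize;
    const int* gArrForBlock = gArr+blockIdx.x*arraySizePerBlock;
    int arraySize = arraySizePerBlock;
    if (blockIdx.x == gridSize-1)
        arraySize = wholeArraySize - blockIdx.x*arraySizePerBlock;
    sumNoncommSingleBlock(gArrForBlock, arraySize, &out[blockIdx.x]);
}

__global__ void sumNoncommFinalPass(int* out) {
    sumNoncommSingleBlock(out, gridSize, out);
}
```

On the host, `sumNoncommFirstPass<<<gridSize, blockSize>>>(...)` would be followed by `sumNoncommFinalPass<<<1, blockSize>>>(...)`; the in-order execution of the two launches replaces the atomic counter and the `__threadfence()`.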
Ideally, you want to launch just enough blocks to saturate all the multiprocessors on the GPU at full occupancy. Exceeding this number, in particular launching as many threads as there are elements in the array, is counterproductive. Doing so no longer increases the raw computing power, but it prevents the use of the very efficient first loop.
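
For completeness, here is a hedged sketch of host code driving `sumNoncommMultiBlock`; the buffer names and the input-filling step are assumptions, not part of the original example. Two details matter in practice: `lastBlockCounter` must be zeroed before the launch, and the `#SM*2` heuristic for `gridSize` can be checked at runtime with `cudaDeviceGetAttribute`:

```cuda
//Sketch only; assumes the kernel definitions above are in the same translation unit.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int* dev_arr;
    int* dev_out;            //holds gridSize partial sums; the result ends up in dev_out[0]
    int* dev_lastBlockCounter;
    cudaMalloc(&dev_arr, wholeArraySize * sizeof(int));
    cudaMalloc(&dev_out, gridSize * sizeof(int));
    cudaMalloc(&dev_lastBlockCounter, sizeof(int));
    cudaMemset(dev_lastBlockCounter, 0, sizeof(int)); //the counter must start at 0
    //... fill dev_arr with input data, e.g. via cudaMemcpy ...

    sumNoncommMultiBlock<<<gridSize, blockSize>>>(dev_arr, dev_out, dev_lastBlockCounter);

    int sum;
    cudaMemcpy(&sum, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("sum: %d\n", sum);

    //the "#SM*2" heuristic for gridSize can be verified at runtime:
    int numSM;
    cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, 0);
    printf("suggested gridSize: %d\n", 2 * numSM);

    cudaFree(dev_arr);
    cudaFree(dev_out);
    cudaFree(dev_lastBlockCounter);
    return 0;
}
```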