// Mapping a continous section of threads to the XY positions for (unsignedint stride = 1; stride <= blockDim.x; stride *= 2) { __syncthreads(); unsignedint index = (threadIdx.x + 1) * 2 * stride - 1; // index of the left child if (index < SECTION_SIZE) { XY[index] += XY[index - stride]; } }
#define CORASE_FACTOR 4 #define SUBSECTION_SIZE (SECTION_SIZE / CORASE_FACTOR) __global__ voidCorasened_Scan_Kernel(int* X, int* Y, unsignedint N){ // Partition X into blockDim.x subsections
// Load X into shared memory in coalesced fashion __shared__ float XY[SECTION_SIZE]; __shared__ float subXY[SUBSECTION_SIZE]; for (int i = 0; i < SECTION_SIZE; i+= blockDim.x) { XY[threadIdx.x + i] = X[threadIdx.x + i]; } __syncthreads();
// Part 1: Compute prefix sum of each subsection in sequenial for (int i = 1; i < SUBSECTION_SIZE; i++) { XY[threadIdx.x * SUBSECTION_SIZE + i] += XY[threadIdx.x * SUBSECTION_SIZE + i - 1]; } __syncthreads();
// Part 2: Compute prefix sum of the last element of each subsection in parallel unsignedint lastElemId = (blockIdx.x + 1) * blockDim.x * CORASE_FACTOR - 1; subXY[threadIdx.x] = XY[(threadIdx.x + 1) * SUBSECTION_SIZE - 1]; float temp = 0.0f; for (int stride = 1; stride < SUBSECTION_SIZE; stride *= 2) { __syncthreads(); if (threadIdx.x >= stride) { temp = subXY[threadIdx.x] + subXY[threadIdx.x - stride]; } __syncthreads(); if (threadIdx.x >= stride) { subXY[threadIdx.x] = temp; } } __syncthreads();
// Part 3: Add the reduction sum of the previous subsection to the current subsection (except the last element) for (int i = 1; i < SUBSECTION_SIZE - 1; i++) { XY[threadIdx.x * SUBSECTION_SIZE + i] += subXY[threadIdx.x]; } __syncthreads();
// Store back to Y for (int i = 0; i < SECTION_SIZE; i+= blockDim.x) { Y[threadIdx.x + i] = XY[threadIdx.x + i]; } }
11.6 Segmented Parallel Scan for Arbitrary-length Inputs