I am working with an RTX 2080 max-q mobile, compute capability 7.5. I am trying to understand why I cannot achieve 32bit/cycle shared memory bandwidth:
From https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-5-x
Shared memory has 32 banks that are organized such that successive 32-bit words map to successive banks. Each bank has a bandwidth of 32 bits per clock cycle.
I’ve written this test code:
#include <iostream>
#include <algorithm>
#include <numeric>
using T = float;
extern __shared__ T bank[];
constexpr int warps = 8;
constexpr int pitch = 32 * warps;
constexpr int size = 32;
__managed__ long long starts[pitch];
__managed__ long long stops[pitch];
__managed__ long long clocks[pitch];
__global__ void kernel()
{
auto* local_bank = bank + threadIdx.x;
auto* a = local_bank;
auto* b = a + size * pitch;
__syncwarp();
auto start = clock64();
__syncwarp();
for (int i = 0; i < size; i++)
b[i * pitch] = a[i * pitch];
__syncwarp();
auto stop = clock64();
__syncwarp();
auto duration = stop - start;
printf("%5lld %s", duration, threadIdx.x % 32 == 31 ? "\n" : "");
__syncthreads();
starts[threadIdx.x + blockDim.x *blockIdx.x] = start;
stops[threadIdx.x + blockDim.x *blockIdx.x] = stop;
clocks[threadIdx.x + blockDim.x *blockIdx.x] = duration;
}
int main()
{
cudaDeviceSetLimit(cudaLimitStackSize, 64 * 1024);
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 64*1024);
cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
kernel<<<1, pitch, 2 * size * pitch * sizeof(T)>>>();
cudaDeviceSynchronize();
auto min_clock = *std::min_element(std::begin(clocks), std::end(clocks));
auto min_start = *std::min_element(std::begin(starts), std::end(starts));
auto max_stop = *std::max_element(std::begin(stops), std::end(stops));
auto avg_clock = std::accumulate(std::begin(clocks), std::end(clocks), 0) / (float)(std::end(clocks) - std::begin(clocks));
auto avg_clock_per_access = avg_clock / (float)(warps * size * 2);
printf("min = %lli\n", min_clock);
printf("max = %lli\n", max_stop - min_start);
printf("avg = %f\n", avg_clock_per_access);
}
It measures in clock cycles the duration of the shared memory copy for each thread and outputs the avg number of cycles taken by a shared memory access. I get 2 cycles and it should be 1 cycle. I don’t understand what I’m doing wrong.
Note: I have tried with float4 and I got the same results. You can change the type T to float4 and reduce size to 8 so that you don’t use more than 64kb of shared memory.