Cannot achieve max shared memory bandwidth

I am working with an RTX 2080 Max-Q (mobile), compute capability 7.5. I am trying to understand why I cannot achieve the documented shared memory bandwidth of 32 bits per bank per clock cycle:

From https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-5-x

Shared memory has 32 banks that are organized such that successive 32-bit words map to successive banks. Each bank has a bandwidth of 32 bits per clock cycle.
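For reference, this is how I read that mapping (my own illustration, not code from the docs):

#include <cstddef>

// Successive 32-bit words map to successive banks, so the bank of a byte
// offset into shared memory is its word index modulo 32.
constexpr unsigned bank_of(std::size_t byte_offset)
{
    return (byte_offset / 4) % 32;   // 32 banks, each 4 bytes (32 bits) wide
}

static_assert(bank_of(0) == 0 && bank_of(4) == 1 && bank_of(128) == 0,
              "word n maps to bank n % 32");

Since the stride I use below (pitch) is a multiple of 32, lane t of a warp should always hit bank t % 32, so I do not expect any bank conflicts.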

I’ve written this test code:

#include <cstdio>      // printf
#include <algorithm>
#include <numeric>

using T = float;
extern __shared__ T bank[];         // dynamic shared memory, sized at launch

constexpr int warps = 8;
constexpr int pitch = 32 * warps;   // stride between a thread's consecutive elements (256 floats)
constexpr int size = 32;            // elements copied per thread

__managed__ long long starts[pitch];
__managed__ long long stops[pitch];
__managed__ long long clocks[pitch];

__global__ void kernel()
{
    auto* local_bank = bank + threadIdx.x;  // each thread starts in its own bank (threadIdx.x % 32)
    auto* a = local_bank;
    auto* b = a + size * pitch;             // destination: second half of the shared-memory allocation

    __syncwarp();
    auto start = clock64();
    __syncwarp();

    // Strided copy: pitch is a multiple of 32, so the lanes of a warp always
    // touch 32 distinct banks and there should be no bank conflicts.
    for (int i = 0; i < size; i++)
        b[i * pitch] = a[i * pitch];

    __syncwarp();
    auto stop = clock64();
    __syncwarp();

    auto duration = stop - start;
    printf("%5lld %s", duration, threadIdx.x % 32 == 31 ? "\n" : ""); 
    __syncthreads();
    starts[threadIdx.x + blockDim.x * blockIdx.x] = start;
    stops[threadIdx.x + blockDim.x * blockIdx.x] = stop;
    clocks[threadIdx.x + blockDim.x * blockIdx.x] = duration;
}

int main()
{
    cudaDeviceSetLimit(cudaLimitStackSize, 64 * 1024);
    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 64 * 1024);  // opt in to 64 KB of dynamic shared memory
    cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
    kernel<<<1, pitch, 2 * size * pitch * sizeof(T)>>>();  // 1 block, 256 threads, 64 KB dynamic shared memory
    cudaDeviceSynchronize();

    auto min_clock = *std::min_element(std::begin(clocks), std::end(clocks));
    auto min_start = *std::min_element(std::begin(starts), std::end(starts));
    auto max_stop = *std::max_element(std::begin(stops), std::end(stops));
    auto avg_clock = std::accumulate(std::begin(clocks), std::end(clocks), 0LL) / (float)(std::end(clocks) - std::begin(clocks));
    auto avg_clock_per_access = avg_clock / (float)(warps * size * 2);
    printf("min = %lli\n", min_clock);
    printf("max = %lli\n", max_stop - min_start);
    printf("avg = %f\n", avg_clock_per_access);
}

It measures, in clock cycles, how long the shared-memory copy takes for each thread, and prints the average number of cycles per shared-memory access. I get about 2 cycles per access, but it should be 1 cycle. I don't understand what I'm doing wrong.
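For clarity, this is the back-of-the-envelope expectation behind dividing by warps * size * 2 (my own assumption: one conflict-free, warp-wide, 32-bit shared-memory instruction completes per cycle), using the constants from the code above:

// Idealized expectation for the copy above:
constexpr int accesses_per_thread = size * 2;                    // one load + one store per iteration = 64
constexpr int warp_instructions   = warps * accesses_per_thread; // 8 warps * 64 = 512 warp-wide LDS/STS
// At one warp-wide instruction per cycle, the whole copy should take roughly
// 512 cycles, so avg_clock_per_access = duration / (warps * size * 2) should
// come out near 1. Instead I measure roughly 2.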

Note: I have tried with float4 and got the same results. You can change the type T to float4 and reduce size to 8 so that the test does not use more than 64 KB of shared memory, as sketched below.
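For completeness, the float4 variant only replaces these definitions (the rest of the code is identical):

using T = float4;                   // 16-byte elements instead of 4-byte floats

constexpr int warps = 8;
constexpr int pitch = 32 * warps;
constexpr int size = 8;             // 2 * 8 * 256 * sizeof(float4) = 64 KB of shared memory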
