Why can’t clang vectorise this loop over a std::span, writing results to a std::array?

Why won’t clang 17.0.1 vectorise the loop in the following function:

void adapt(std::span<const F, N + 1> signal)
{
    F true_val = signal.back();
    F y = dot_prod<F, N>(&signal[0], w.begin());
    F e = true_val - y;
    F dot = dot_prod<F, N>(&signal[0], &signal[0]);
    F nu = mu / (eps + dot);
    for (size_t i = 0; i < N; i++)
    {
        w[i] += nu * e * signal[i];
    }
}

There are no loop-carried dependencies or floating-point associativity issues, and GCC 13.2 has no trouble vectorising it.

Here’s a link to the full code on Compiler Explorer.

For context: I am trying to optimise my code to use a vectorised dot product. std::inner_product usually emits a scalar implementation because of the floating-point associativity problem unless you compile with -ffast-math. However, I only want -ffast-math to apply to a single function, and I was looking for a portable way to do that for both clang and GCC. While looking at the output, I noticed that clang doesn’t vectorise this other loop.
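The direction I was experimenting with is to hide each compiler’s mechanism behind a macro, so clang only sees the float_control pragma and GCC only sees the optimize attribute. This is just a sketch and the FAST_MATH_* macro names are placeholders of mine:

#include <cstddef>

// Placeholder macros: each compiler only ever sees its own per-function
// fast-math mechanism.
#if defined(__clang__)
#  define FAST_MATH_ATTR
#  define FAST_MATH_PUSH _Pragma("float_control(precise, off, push)")
#  define FAST_MATH_POP  _Pragma("float_control(pop)")
#elif defined(__GNUC__)
#  define FAST_MATH_ATTR __attribute__((optimize("-ffast-math")))
#  define FAST_MATH_PUSH
#  define FAST_MATH_POP
#else
#  define FAST_MATH_ATTR
#  define FAST_MATH_PUSH
#  define FAST_MATH_POP
#endif

FAST_MATH_PUSH
template <typename F, std::size_t N>
FAST_MATH_ATTR constexpr inline F dot_prod(const F *a, const F *b)
{
    F acc = 0;
    for (std::size_t i = 0; i < N; ++i)
        acc += a[i] * b[i];
    return acc;
}
FAST_MATH_POP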

Here is the full C++:

#include <cstddef>
#include <array>
#include <span>
#include <numeric>
#include <cassert>

#pragma float_control(precise, off, push)
template <typename F, size_t N> 
__attribute__((optimize("-ffast-math"))) 
constexpr inline F dot_prod(const F *a, const F *b)
{
    F acc = 0.0f;
    for(size_t i = 0; i < N; i++)
        acc += a[i] * b[i];
    return acc;
}
#pragma float_control(pop)

template <typename F, size_t N> class NlmsFilter
{
    static constexpr F mu = 0.5;
    static constexpr F eps = 1.0;
    std::array<F, N> w = {};

  public:
    F predict(std::span<const F> signal)
    {
        assert(signal.size() <= N + 1);
        if (signal.size() == N + 1)
        {
            auto signal_static = signal.template subspan<0, N + 1>();
            adapt(signal_static);
            return dot_prod<F, N>(&signal_static[1], w.begin());
        }
        else if (signal.size() == 0)
        {
            return 0.0f;
        }
        else
        {
            return signal.back();
        }
    }
    void adapt(std::span<const F, N + 1> signal)
    {
        F true_val = signal.back();
        F y = dot_prod<F, N>(&signal[0], w.begin());
        F e = true_val - y;
        F dot = dot_prod<F, N>(&signal[0], &signal[0]);
        F nu = mu / (eps + dot);
        for (size_t i = 0; i < N; i++)
        {
            w[i] += nu * e * signal[i];
        }
    }
};

template class NlmsFilter<float, 32>;

Here is the assembly output from clang:

NlmsFilter<float, 32ul>::predict(std::span<float const, 18446744073709551615ul>): # @NlmsFilter<float, 32ul>::predict(std::span<float const, 18446744073709551615ul>)
        push    r14
        push    rbx
        push    rax
        cmp     rdx, 34
        jae     .LBB0_6
        test    rdx, rdx
        je      .LBB0_2
        mov     rbx, rsi
        cmp     rdx, 33
        jne     .LBB0_4
        mov     r14, rdi
        mov     rsi, rbx
        call    NlmsFilter<float, 32ul>::adapt(std::span<float const, 33ul>)
        add     rbx, 4
        mov     rdi, rbx
        mov     rsi, r14
        add     rsp, 8
        pop     rbx
        pop     r14
        jmp     float dot_prod<float, 32ul>(float const*, float const*)   # TAILCALL
.LBB0_2:
        vxorps  xmm0, xmm0, xmm0
        add     rsp, 8
        pop     rbx
        pop     r14
        ret
.LBB0_4:
        vmovss  xmm0, dword ptr [rbx + 4*rdx - 4] # xmm0 = mem[0],zero,zero,zero
        add     rsp, 8
        pop     rbx
        pop     r14
        ret
.LBB0_6:
        mov     edi, offset .L.str
        mov     esi, offset .L.str.1
        mov     ecx, offset .L__PRETTY_FUNCTION__.NlmsFilter<float, 32ul>::predict(std::span<float const, 18446744073709551615ul>)
        mov     edx, 27
        call    __assert_fail
.LCPI1_0:
        .long   0x3f800000                      # float 1
.LCPI1_1:
        .long   0x3f000000                      # float 0.5
NlmsFilter<float, 32ul>::adapt(std::span<float const, 33ul>): # @NlmsFilter<float, 32ul>::adapt(std::span<float const, 33ul>)
        push    r14
        push    rbx
        push    rax
        mov     r14, rsi
        mov     rbx, rdi
        vmovss  xmm0, dword ptr [rsi + 128]     # xmm0 = mem[0],zero,zero,zero
        vmovss  dword ptr [rsp + 4], xmm0       # 4-byte Spill
        mov     rdi, rsi
        mov     rsi, rbx
        call    float dot_prod<float, 32ul>(float const*, float const*)
        vmovss  xmm1, dword ptr [rsp + 4]       # 4-byte Reload
        vsubss  xmm0, xmm1, xmm0
        vmovss  dword ptr [rsp + 4], xmm0       # 4-byte Spill
        mov     rdi, r14
        mov     rsi, r14
        call    float dot_prod<float, 32ul>(float const*, float const*)
        vaddss  xmm0, xmm0, dword ptr [rip + .LCPI1_0]
        vmovss  xmm1, dword ptr [rip + .LCPI1_1] # xmm1 = mem[0],zero,zero,zero
        vdivss  xmm0, xmm1, xmm0
        vmulss  xmm0, xmm0, dword ptr [rsp + 4] # 4-byte Folded Reload
        vmulss  xmm1, xmm0, dword ptr [r14]
        vaddss  xmm1, xmm1, dword ptr [rbx]
        vmovss  dword ptr [rbx], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 4]
        vaddss  xmm1, xmm1, dword ptr [rbx + 4]
        vmovss  dword ptr [rbx + 4], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 8]
        vaddss  xmm1, xmm1, dword ptr [rbx + 8]
        vmovss  dword ptr [rbx + 8], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 12]
        vaddss  xmm1, xmm1, dword ptr [rbx + 12]
        vmovss  dword ptr [rbx + 12], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 16]
        vaddss  xmm1, xmm1, dword ptr [rbx + 16]
        vmovss  dword ptr [rbx + 16], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 20]
        vaddss  xmm1, xmm1, dword ptr [rbx + 20]
        vmovss  dword ptr [rbx + 20], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 24]
        vaddss  xmm1, xmm1, dword ptr [rbx + 24]
        vmovss  dword ptr [rbx + 24], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 28]
        vaddss  xmm1, xmm1, dword ptr [rbx + 28]
        vmovss  dword ptr [rbx + 28], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 32]
        vaddss  xmm1, xmm1, dword ptr [rbx + 32]
        vmovss  dword ptr [rbx + 32], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 36]
        vaddss  xmm1, xmm1, dword ptr [rbx + 36]
        vmovss  dword ptr [rbx + 36], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 40]
        vaddss  xmm1, xmm1, dword ptr [rbx + 40]
        vmovss  dword ptr [rbx + 40], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 44]
        vaddss  xmm1, xmm1, dword ptr [rbx + 44]
        vmovss  dword ptr [rbx + 44], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 48]
        vaddss  xmm1, xmm1, dword ptr [rbx + 48]
        vmovss  dword ptr [rbx + 48], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 52]
        vaddss  xmm1, xmm1, dword ptr [rbx + 52]
        vmovss  dword ptr [rbx + 52], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 56]
        vaddss  xmm1, xmm1, dword ptr [rbx + 56]
        vmovss  dword ptr [rbx + 56], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 60]
        vaddss  xmm1, xmm1, dword ptr [rbx + 60]
        vmovss  dword ptr [rbx + 60], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 64]
        vaddss  xmm1, xmm1, dword ptr [rbx + 64]
        vmovss  dword ptr [rbx + 64], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 68]
        vaddss  xmm1, xmm1, dword ptr [rbx + 68]
        vmovss  dword ptr [rbx + 68], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 72]
        vaddss  xmm1, xmm1, dword ptr [rbx + 72]
        vmovss  dword ptr [rbx + 72], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 76]
        vaddss  xmm1, xmm1, dword ptr [rbx + 76]
        vmovss  dword ptr [rbx + 76], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 80]
        vaddss  xmm1, xmm1, dword ptr [rbx + 80]
        vmovss  dword ptr [rbx + 80], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 84]
        vaddss  xmm1, xmm1, dword ptr [rbx + 84]
        vmovss  dword ptr [rbx + 84], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 88]
        vaddss  xmm1, xmm1, dword ptr [rbx + 88]
        vmovss  dword ptr [rbx + 88], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 92]
        vaddss  xmm1, xmm1, dword ptr [rbx + 92]
        vmovss  dword ptr [rbx + 92], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 96]
        vaddss  xmm1, xmm1, dword ptr [rbx + 96]
        vmovss  dword ptr [rbx + 96], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 100]
        vaddss  xmm1, xmm1, dword ptr [rbx + 100]
        vmovss  dword ptr [rbx + 100], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 104]
        vaddss  xmm1, xmm1, dword ptr [rbx + 104]
        vmovss  dword ptr [rbx + 104], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 108]
        vaddss  xmm1, xmm1, dword ptr [rbx + 108]
        vmovss  dword ptr [rbx + 108], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 112]
        vaddss  xmm1, xmm1, dword ptr [rbx + 112]
        vmovss  dword ptr [rbx + 112], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 116]
        vaddss  xmm1, xmm1, dword ptr [rbx + 116]
        vmovss  dword ptr [rbx + 116], xmm1
        vmulss  xmm1, xmm0, dword ptr [r14 + 120]
        vaddss  xmm1, xmm1, dword ptr [rbx + 120]
        vmovss  dword ptr [rbx + 120], xmm1
        vmulss  xmm0, xmm0, dword ptr [r14 + 124]
        vaddss  xmm0, xmm0, dword ptr [rbx + 124]
        vmovss  dword ptr [rbx + 124], xmm0
        add     rsp, 8
        pop     rbx
        pop     r14
        ret
float dot_prod<float, 32ul>(float const*, float const*):          # @float dot_prod<float, 32ul>(float const*, float const*)
        vmovups ymm0, ymmword ptr [rsi]
        vmovups ymm1, ymmword ptr [rsi + 32]
        vmovups ymm2, ymmword ptr [rsi + 64]
        vmulps  ymm2, ymm2, ymmword ptr [rdi + 64]
        vmovups ymm3, ymmword ptr [rsi + 96]
        vmulps  ymm0, ymm0, ymmword ptr [rdi]
        vaddps  ymm0, ymm0, ymm2
        vmulps  ymm2, ymm3, ymmword ptr [rdi + 96]
        vmulps  ymm1, ymm1, ymmword ptr [rdi + 32]
        vaddps  ymm1, ymm1, ymm2
        vaddps  ymm0, ymm0, ymm1
        vextractf128    xmm1, ymm0, 1
        vaddps  xmm0, xmm0, xmm1
        vpermilpd       xmm1, xmm0, 1           # xmm1 = xmm0[1,0]
        vaddps  xmm0, xmm0, xmm1
        vmovshdup       xmm1, xmm0              # xmm1 = xmm0[1,1,3,3]
        vaddss  xmm0, xmm0, xmm1
        vzeroupper
        ret
NlmsFilter<float, 32ul>::mu:
        .long   0x3f000000                      # float 0.5

NlmsFilter<float, 32ul>::eps:
        .long   0x3f800000                      # float 1

.L.str:
        .asciz  "signal.size() <= N + 1"

.L.str.1:
        .asciz  "/app/example.cpp"

.L__PRETTY_FUNCTION__.NlmsFilter<float, 32ul>::predict(std::span<float const, 18446744073709551615ul>):
        .asciz  "F NlmsFilter<float, 32>::predict(std::span<const F>) [F = float, N = 32]"

  • Unless my eyes deceive me, it looks like clang does vectorize the code; it also unrolls the loop while doing so.

  • clang is using vmulss and vmovss while gcc uses vmulps and vmovps; the ss suffix stands for "scalar single" and ps for "packed single". So while clang does unroll the loop, it does not actually use packed vector instructions.

  • While the link is good to have, a question must be self-contained, and in this case that means you should include the relevant assembly.

  • dot_prod gets vectorized by both clang and gcc. For the w[i] += ... loop, gcc checks whether signal overlaps with w: if it does not, it vectorizes that loop as well, otherwise it uses a scalar loop. Clang always uses an unrolled scalar loop for that part.

  • Another thing missing for the question to be self-contained is the types of the variables. Also, use full Godbolt links so that link rot isn’t a total showstopper; that’s what full links are for. How are we supposed to post godbolt links now that URL shortening is blocked? (@PasserBy)

The problem is that clang can’t be sure whether w (partially) aliases with the data that signal points to.

GCC in this case compares the w.data() and signal.data() pointers: if they are far enough apart, it runs a vectorized loop, otherwise it falls back to a scalar loop. Clang presumably judges that the cost of this extra runtime check is not justified by the potential performance gain (it does emit the check at some point if you increase the array size).
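Conceptually, the runtime check that GCC emits ahead of the loop looks roughly like this (my sketch in C++, not the literal generated code):

#include <cstddef>

// Two n-element ranges overlap exactly when each start pointer lies before
// the end of the other range. GCC branches on this once per call: the
// "no overlap" path gets the vectorized loop, the other path keeps the
// original scalar loop.
inline bool ranges_overlap(const float *dst, const float *src, std::size_t n)
{
    return dst < src + n && src < dst + n;
}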

You can tell GCC/Clang that pointed-to memory will not overlap using the __restrict keyword (this is non-standard C++; C does provide a standard restrict keyword). However, as far as I know this can only be applied to pointers, not directly to a std::span. You can work around that by providing a helper function which takes its output address as a __restrict-qualified plain pointer (note that __restrict is a promise to the compiler: if the ranges do overlap after all, the behaviour is undefined), e.g.:

template<typename F, size_t N>
inline void scale(F * __restrict out, F const* in, F factor)
{
    // __restrict promises that `out` does not alias `in`, so the compiler
    // can vectorize the loop without a runtime overlap check.
    for(size_t i = 0; i < N; ++i)
    {
        out[i] += factor * in[i]; // accumulate, matching w[i] += nu * e * signal[i]
    }
}

And instead of your loop, call it like:

scale<F, N>(w.data(), signal.data(), nu*e);
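
With that helper, adapt() would look roughly like this (a sketch of the same function with only the final loop replaced):

void adapt(std::span<const F, N + 1> signal)
{
    F true_val = signal.back();
    F y = dot_prod<F, N>(&signal[0], w.begin());
    F e = true_val - y;
    F dot = dot_prod<F, N>(&signal[0], &signal[0]);
    F nu = mu / (eps + dot);
    // The __restrict on the helper's output pointer lets clang vectorize the update.
    scale<F, N>(w.data(), signal.data(), nu * e);
}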

Modified Godbolt link: https://godbolt.org/z/WWbdz489e
