With this patch, the following code is optimized correctly:
#include <x86intrin.h>
#include <immintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
#include <cstdio>
#include <cinttypes>
/*
 * Broadcast one float lane of a vector to all four lanes of the result.
 *
 * ptr: array of __m128 vectors; only read, never written.
 * i:   index of the vector to read within ptr.
 * j:   lane to extract from ptr[i] via the compiler's vector subscript
 *      extension; presumably must be in 0..3 for a four-float __m128,
 *      and it is not range-checked here.
 *
 * Returns a __m128 whose four elements all equal ptr[i][j].
 */
__m128 bss4( const __m128 *ptr, size_t i, size_t j )
{
    /* Variable-index lane extraction: j is a runtime size_t, not a constant. */
    float f = ptr[i][j];
    /* Splat the extracted scalar across every lane. */
    return (__m128) { f, f, f, f };
}Previously an unneeded trunc + zext would be emitted.