With this patch, the following code is optimized correctly:
#include <x86intrin.h>
#include <immintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
#include <cstdio>
#include <cinttypes>

// Broadcast a single lane of one vector in an array to all four lanes.
//
// ptr: array of 4 x float vectors; must contain at least i + 1 elements.
// i:   index of the vector to read from.
// j:   lane within ptr[i] to extract; only 0..3 are valid for an __m128,
//      and no range check is performed here.
//
// Returns a vector whose four lanes all hold ptr[i][j].
__m128 bss4( const __m128 *ptr, size_t i, size_t j )
{
    // Lane extract with a run-time index (GNU vector subscript extension).
    float f = ptr[i][j];
    return (__m128) { f, f, f, f };
}
Previously, an unneeded trunc + zext pair was emitted.