diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -4902,8 +4902,7 @@ static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) { - __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); - return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); + return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); } /// Loads two 128-bit floating-point vectors of [2 x double] from @@ -4930,8 +4929,7 @@ static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) { - __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); - return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); + return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); } /// Loads two 128-bit integer vectors from unaligned memory locations and @@ -4955,8 +4953,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) { - __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); - return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); + return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo)); } /* SIMD store ops (unaligned) */ diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1233,30 +1233,24 @@ __m256 test_mm256_loadu2_m128(float* A, float* B) { // CHECK-LABEL: test_mm256_loadu2_m128 // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}} - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}} - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> + // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> return _mm256_loadu2_m128(A, B); } __m256d test_mm256_loadu2_m128d(double* A, double* B) { // CHECK-LABEL: test_mm256_loadu2_m128d // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}} - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}} - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <4 x i32> - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> return _mm256_loadu2_m128d(A, B); } __m256i test_mm256_loadu2_m128i(__m128i* A, __m128i* B) { // CHECK-LABEL: test_mm256_loadu2_m128i // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}} - // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}} - // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> - // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> return _mm256_loadu2_m128i(A, B); }