diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15445,6 +15445,11 @@ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + if (Subtarget.hasSSE41()) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { @@ -15498,10 +15503,6 @@ return V; if (Subtarget.hasSSE41()) { - if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) - return Blend; - // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; @@ -19082,6 +19083,10 @@ return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -4,14 +4,23 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s define <16 x float> @test1(<16 x float> %x, ptr %br, float %y) nounwind { -; CHECK-LABEL: test1: -; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; CHECK-NEXT: vbroadcastss %xmm1, %zmm1 -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15] -; CHECK-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: test1: +; KNL: ## %bb.0: +; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] +; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0 +; KNL-NEXT: movw $16384, %ax ## imm = 0x4000 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vbroadcastss %xmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: test1: +; SKX: ## %bb.0: +; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] +; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0 +; SKX-NEXT: movw $16384, %ax ## imm = 0x4000 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vbroadcastss %xmm1, %zmm0 {%k1} +; SKX-NEXT: retq %rrr = load float, ptr %br %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1 %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -305,19 +305,15 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF: ; ALL: # %bb.0: -; ALL-NEXT: vmovups (%rdi), %zmm1 -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> -; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 +; ALL-NEXT: vmovdqu64 (%rdi), %zmm0 +; ALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups (%eax), %zmm1 -; X86-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> -; X86-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 +; X86-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0 +; X86-AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr3 = getelementptr inbounds float, ptr %ptr, i64 3 %ptrC = getelementptr inbounds float, ptr %ptr, i64 12