diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -128,14 +128,15 @@
   };
   // `vpermilps r, i` -> `vshufps r, r, i`
-  // `vshufps` is always as fast or faster than
-  // `vpermilps` and takes 1 less byte of code size.
+  // `vpermilps r, i, k` -> `vshufps r, r, i, k`
+  // `vshufps` is always as fast or faster than `vpermilps` and takes
+  // 1 less byte of code size for the VEX and SSE encodings.
   auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
     if (!NewOpcPreferable(NewOpc))
       return false;
     unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
     MI.removeOperand(NumOperands - 1);
-    MI.addOperand(MI.getOperand(1));
+    MI.addOperand(MI.getOperand(NumOperands - 2));
     MI.setDesc(TII->get(NewOpc));
     MI.addOperand(MachineOperand::CreateImm(MaskImm));
     return true;
   };
@@ -156,6 +157,8 @@

   // `vunpcklpd/vmovlhps r, r` -> `vshufps r, r, 0x44`
   // `vunpckhpd/vmovhlps r, r` -> `vshufps r, r, 0xee`
+  // `vunpcklpd r, r, k` -> `vshufps r, r, 0x44, k`
+  // `vunpckhpd r, r, k` -> `vshufps r, r, 0xee, k`
   // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with
   // `vunpck{l|h}pd` as it uses less code size.
   // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PS`
@@ -175,7 +178,6 @@
     return ProcessUNPCKPD(NewOpc, 0xee);
   };

-  // TODO: Add masked predicate execution variants.
   switch (Opc) {
   case X86::VPERMILPSri:
     return ProcessVPERMILPSri(X86::VSHUFPSrri);
@@ -187,6 +189,18 @@
     return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
   case X86::VPERMILPSZri:
     return ProcessVPERMILPSri(X86::VSHUFPSZrri);
+  case X86::VPERMILPSZ128rikz:
+    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
+  case X86::VPERMILPSZ256rikz:
+    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
+  case X86::VPERMILPSZrikz:
+    return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
+  case X86::VPERMILPSZ128rik:
+    return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
+  case X86::VPERMILPSZ256rik:
+    return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
+  case X86::VPERMILPSZrik:
+    return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
   case X86::VPERMILPSmi:
     return ProcessVPERMILPSmi(X86::VPSHUFDmi);
   case X86::VPERMILPSYmi:
@@ -199,6 +213,18 @@
     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
   case X86::VPERMILPSZmi:
     return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
+  case X86::VPERMILPSZ128mikz:
+    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
+  case X86::VPERMILPSZ256mikz:
+    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
+  case X86::VPERMILPSZmikz:
+    return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
+  case X86::VPERMILPSZ128mik:
+    return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
+  case X86::VPERMILPSZ256mik:
+    return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
+  case X86::VPERMILPSZmik:
+    return ProcessVPERMILPSmi(X86::VPSHUFDZmik);

   // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to
   // `{VP}UNPCK{L|H}QDQ{...}` which gets the same perf benefit as
@@ -222,6 +248,18 @@
     return ProcessUNPCKLPDrr(X86::VSHUFPSZ256rri);
   case X86::VUNPCKLPDZrr:
     return ProcessUNPCKLPDrr(X86::VSHUFPSZrri);
+  case X86::VUNPCKLPDZ128rrk:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZ128rrik);
+  case X86::VUNPCKLPDZ256rrk:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZ256rrik);
+  case X86::VUNPCKLPDZrrk:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZrrik);
+  case X86::VUNPCKLPDZ128rrkz:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZ128rrikz);
+  case X86::VUNPCKLPDZ256rrkz:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZ256rrikz);
+  case X86::VUNPCKLPDZrrkz:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZrrikz);
   case X86::UNPCKHPDrr:
     return ProcessUNPCKHPDrr(X86::SHUFPSrri);
   case X86::VUNPCKHPDrr:
@@ -234,6 +272,18 @@
     return ProcessUNPCKHPDrr(X86::VSHUFPSZ256rri);
   case X86::VUNPCKHPDZrr:
     return ProcessUNPCKHPDrr(X86::VSHUFPSZrri);
+  case X86::VUNPCKHPDZ128rrk:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZ128rrik);
+  case X86::VUNPCKHPDZ256rrk:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZ256rrik);
+  case X86::VUNPCKHPDZrrk:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZrrik);
+  case X86::VUNPCKHPDZ128rrkz:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZ128rrikz);
+  case X86::VUNPCKHPDZ256rrkz:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZ256rrikz);
+  case X86::VUNPCKHPDZrrkz:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZrrikz);
   default:
     return false;
   }
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -998,13 +998,13 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: kmovw %eax, %k1
-; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mm512_mask_permute_ps:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovw %edi, %k1
-; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: vshufps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
 ; X64-NEXT: retq
 %arg1 = bitcast i16 %a1 to <16 x i1>
 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32>
@@ -1017,13 +1017,13 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: kmovw %eax, %k1
-; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mm512_maskz_permute_ps:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovw %edi, %k1
-; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
 ; X64-NEXT: retq
 %arg0 = bitcast i16 %a0 to <16 x i1>
 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1033,7 +1033,7 @@
 ; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
 ; X86: ## %bb.0:
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16]
+; X86-NEXT: vshufps $22, %zmm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xc8,0x16]
 ; X86-NEXT: ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
 ; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
 ; X86-NEXT: retl ## encoding: [0xc3]
@@ -1041,7 +1041,7 @@
 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
 ; X64: ## %bb.0:
 ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16]
+; X64-NEXT: vshufps $22, %zmm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xc8,0x16]
 ; X64-NEXT: ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
 ; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
 ; X64-NEXT: retq ## encoding: [0xc3]
@@ -1053,14 +1053,14 @@
 ; X86-LABEL: test_int_x86_avx512_maskz_vpermil_ps_512:
 ; X86: ## %bb.0:
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16]
+; X86-NEXT: vshufps $22, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0xc6,0xc0,0x16]
 ; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
 ; X86-NEXT: retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_512:
 ; X64: ## %bb.0:
 ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16]
+; X64-NEXT: vshufps $22, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0xc6,0xc0,0x16]
 ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
 ; X64-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
--- a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
@@ -16,7 +16,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[2,1,3,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[2,1,3,1]
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
@@ -30,7 +30,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,3,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,3,1]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -42,7 +42,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
@@ -56,7 +56,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -68,7 +68,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
@@ -82,7 +82,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -102,7 +102,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
@@ -116,7 +116,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32>
 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -292,7 +292,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,7,6,7,6]
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,7,6,7,6]
 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
@@ -306,7 +306,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,7,6,7,6]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,7,6,7,6]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -352,7 +352,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,2,1,0,6,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,2,1,0,6,6,5,4]
 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
@@ -366,7 +366,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,1,0,6,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,1,0,6,6,5,4]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -404,7 +404,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3,6,5,7,7]
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3,6,5,7,7]
 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
@@ -418,7 +418,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3,6,5,7,7]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3,6,5,7,7]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -464,7 +464,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,1,7,4,6,5]
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,1,7,4,6,5]
 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
@@ -478,7 +478,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,1,7,4,6,5]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,1,7,4,6,5]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32>
 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -785,7 +785,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
@@ -799,7 +799,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
 %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
@@ -845,7 +845,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
@@ -859,7 +859,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
 %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
@@ -897,7 +897,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
@@ -911,7 +911,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
 %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
@@ -957,7 +957,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
@@ -971,7 +971,7 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
 ; CHECK-NEXT: retq
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32>
 %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -903,7 +903,7 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
+; X86-NEXT: vshufps $22, %ymm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xc8,0x16]
 ; X86-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
 ; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; X86-NEXT: retl # encoding: [0xc3]
@@ -911,7 +911,7 @@
 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
+; X64-NEXT: vshufps $22, %ymm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xc8,0x16]
 ; X64-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
 ; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; X64-NEXT: retq # encoding: [0xc3]
@@ -924,14 +924,14 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
+; X86-NEXT: vshufps $22, %ymm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0xc6,0xc0,0x16]
 ; X86-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_256:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
+; X64-NEXT: vshufps $22, %ymm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0xc6,0xc0,0x16]
 ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
@@ -955,7 +955,7 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
+; X86-NEXT: vshufps $22, %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xc8,0x16]
 ; X86-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0]
 ; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT: retl # encoding: [0xc3]
@@ -963,7 +963,7 @@
 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
+; X64-NEXT: vshufps $22, %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xc8,0x16]
 ; X64-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0]
 ; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X64-NEXT: retq # encoding: [0xc3]
@@ -976,14 +976,14 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
+; X86-NEXT: vshufps $22, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0xc6,0xc0,0x16]
 ; X86-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_128:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
+; X64-NEXT: vshufps $22, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0xc6,0xc0,0x16]
 ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
--- a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
@@ -35,7 +35,7 @@
 ; CHECK-LABEL: transform_VPERMILPSZrrkz:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-NEXT: retq
 %mask = bitcast i16 %mask_int to <16 x i1>
 %shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32>
@@ -47,7 +47,7 @@
 ; CHECK-LABEL: transform_VPERMILPSYrrkz:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,0,7,6,5,4]
 ; CHECK-NEXT: retq
 %mask = bitcast i8 %mask_int to <8 x i1>
 %shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32>
@@ -59,7 +59,7 @@
 ; CHECK-LABEL: transform_VPERMILPSrrkz:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2,1,0]
 ; CHECK-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
 %shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32>
@@ -71,7 +71,7 @@
 ; CHECK-LABEL: transform_VPERMILPSZrrk:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %mask = bitcast i16 %mask_int to <16 x i1>
@@ -84,7 +84,7 @@
 ; CHECK-LABEL: transform_VPERMILPSYrrk:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,0,7,6,5,4]
 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %mask = bitcast i8 %mask_int to <8 x i1>
@@ -97,7 +97,7 @@
 ; CHECK-LABEL: transform_VPERMILPSrrk:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[3,2,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,2,1,0]
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
@@ -182,11 +182,29 @@
 }

 define <16 x float> @transform_VPERMILPSZrmkz(ptr %ap, i16 %mask_int) nounwind {
-; CHECK-LABEL: transform_VPERMILPSZrmkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSZrmkz:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSZrmkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSZrmkz:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrmkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i16 %mask_int to <16 x i1>
 %a = load <16 x float>, ptr %ap
 %shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32>
@@ -195,11 +213,29 @@
 }

 define <8 x float> @transform_VPERMILPSYrmkz(ptr %ap, i8 %mask_int) nounwind {
-; CHECK-LABEL: transform_VPERMILPSYrmkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSYrmkz:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSYrmkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSYrmkz:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSYrmkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i8 %mask_int to <8 x i1>
 %a = load <8 x float>, ptr %ap
 %shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32>
@@ -208,11 +244,29 @@
 }

 define <4 x float> @transform_VPERMILPSrmkz(ptr %ap, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VPERMILPSrmkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSrmkz:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSrmkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSrmkz:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSrmkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
 %a = load <4 x float>, ptr %ap
 %shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32>
@@ -221,11 +275,29 @@
 }

 define <16 x float> @transform_VPERMILPSZrmk(ptr %ap, <16 x float> %b, i16 %mask_int) nounwind {
-; CHECK-LABEL: transform_VPERMILPSZrmk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSZrmk:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSZrmk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSZrmk:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrmk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i16 %mask_int to <16 x i1>
 %a = load <16 x float>, ptr %ap
 %shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32>
@@ -234,11 +306,29 @@
 }

 define <8 x float> @transform_VPERMILPSYrmk(ptr %ap, <8 x float> %b, i8 %mask_int) nounwind {
-; CHECK-LABEL: transform_VPERMILPSYrmk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSYrmk:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSYrmk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSYrmk:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSYrmk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i8 %mask_int to <8 x i1>
 %a = load <8 x float>, ptr %ap
 %shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32>
@@ -247,11 +337,29 @@
 }

 define <4 x float> @transform_VPERMILPSrmk(ptr %ap, <4 x float> %b, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VPERMILPSrmk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSrmk:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSrmk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSrmk:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSrmk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
 %a = load <4 x float>, ptr %ap
 %shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
@@ -165,11 +165,29 @@
 }

 define <4 x double> @transform_VUNPCKLPDYrrkz(<4 x double> %a, <4 x double> %b, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrrkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer
@@ -177,11 +195,29 @@
 }

 define <4 x double> @transform_VUNPCKHPDYrrkz(<4 x double> %a, <4 x double> %b, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrrkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer
@@ -189,11 +225,29 @@
 }

 define <2 x double> @transform_VUNPCKLPDrrkz(<2 x double> %a, <2 x double> %b, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrrkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,1]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i2 %mask_int to <2 x i1>
 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32>
 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> zeroinitializer
@@ -201,11 +255,29 @@
 }

 define <2 x double> @transform_VUNPCKHPDrrkz(<2 x double> %a, <2 x double> %b, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrrkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3],xmm1[2,3]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i2 %mask_int to <2 x i1>
 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32>
 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> zeroinitializer
@@ -239,12 +311,33 @@
 }

 define <4 x double> @transform_VUNPCKLPDYrrk(<4 x double> %a, <4 x double> %b, <4 x double> %c, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrrk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT: vmovapd %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5]
+; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-V4-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ZNVER4-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c
@@ -252,12 +345,33 @@
 }

 define <4 x double> @transform_VUNPCKHPDYrrk(<4 x double> %a, <4 x double> %b, <4 x double> %c, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrrk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT: vmovapd %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7]
+; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-V4-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ZNVER4-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i4 %mask_int to <4 x i1>
 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c
@@ -265,12 +379,33 @@
 }

 define <2 x double> @transform_VUNPCKLPDrrk(<2 x double> %a, <2 x double> %b, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrrk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
-; CHECK-NEXT: vmovapd %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[0,1],xmm1[0,1]
+; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-V4-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-ZNVER4-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i2 %mask_int to <2 x i1>
 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32>
 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
@@ -278,12 +413,33 @@
 }

 define <2 x double> @transform_VUNPCKHPDrrk(<2 x double> %a, <2 x double> %b, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrrk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
-; CHECK-NEXT: vmovapd %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %edi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[2,3],xmm1[2,3]
+; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %edi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-V4-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-ZNVER4-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ZNVER4-NEXT: retq
 %mask = bitcast i2 %mask_int to <2 x i1>
 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32>
 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -362,19 +362,19 @@
 ; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X86-NEXT: retl
 ;
 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X64-AVX512F: # %bb.0:
 ; X64-AVX512F-NEXT: kmovw %edi, %k1
-; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-AVX512F-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X64-AVX512F-NEXT: retq
 ;
 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X64-AVX512BW: # %bb.0:
 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
-; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-AVX512BW-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X64-AVX512BW-NEXT: retq
 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
 ret <16 x float> %res0
@@ -543,19 +543,19 @@
 ; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; X86-NEXT: retl
 ;
 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
 ; X64-AVX512F: # %bb.0:
 ; X64-AVX512F-NEXT: kmovw %edi, %k1
-; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X64-AVX512F-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; X64-AVX512F-NEXT: retq
 ;
 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
 ; X64-AVX512BW: # %bb.0:
 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
-; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X64-AVX512BW-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; X64-AVX512BW-NEXT: retq
 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
 ret <16 x float> %res0