Index: lib/Target/X86/InstPrinter/X86InstComments.cpp =================================================================== --- lib/Target/X86/InstPrinter/X86InstComments.cpp +++ lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -107,6 +107,51 @@ } } +#define CASE_VSHUF64x2_FAMILY(Inst, Suffix, src2) \ + case X86::VSHUFF##Inst##Suffix##r##src2##i: \ + case X86::VSHUFF##Inst##Suffix##r##src2##ik: \ + case X86::VSHUFF##Inst##Suffix##r##src2##ikz: \ + case X86::VSHUFI##Inst##Suffix##r##src2##i: \ + case X86::VSHUFI##Inst##Suffix##r##src2##ik: \ + case X86::VSHUFI##Inst##Suffix##r##src2##ikz: + +#define CASE_ALL_VSHUF64x2_FAMILY(Inst) \ + CASE_VSHUF64x2_FAMILY(Inst, Z, r) \ + CASE_VSHUF64x2_FAMILY(Inst, Z, m) \ + CASE_VSHUF64x2_FAMILY(Inst, Z256, r) \ + CASE_VSHUF64x2_FAMILY(Inst, Z256, m) \ + +/// \brief Extracts the types and if it has memory operand for a given +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. +static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) { + HasMemOp = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown VSHUF64x2 family instructions."); + break; + CASE_VSHUF64x2_FAMILY(64X2, Z, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF64x2_FAMILY(64X2, Z, r) + VT = MVT::v8i64; + break; + CASE_VSHUF64x2_FAMILY(64X2, Z256, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF64x2_FAMILY(64X2, Z256, r) + VT = MVT::v4i64; + break; + CASE_VSHUF64x2_FAMILY(32X4, Z, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF64x2_FAMILY(32X4, Z, r) + VT = MVT::v16i32; + break; + CASE_VSHUF64x2_FAMILY(32X4, Z256, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF64x2_FAMILY(32X4, Z256, r) + VT = MVT::v8i32; + break; + } +} + //===----------------------------------------------------------------------===// // Top Level Entrypoint //===----------------------------------------------------------------------===// @@ -726,7 +771,25 @@ Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - + CASE_ALL_VSHUF64x2_FAMILY(64X2) + CASE_ALL_VSHUF64x2_FAMILY(32X4) { + MVT VT; + bool HasMemOp; + unsigned NumOp = MI->getNumOperands(); + getVSHUF64x2FamilyInfo(MI, VT, HasMemOp); + decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + if (HasMemOp) { + assert((NumOp >= 8) && "Expected at least 8 operands!"); + Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg()); + } else { + assert((NumOp >= 4) && "Expected at least 4 operands!"); + Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg()); + Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg()); + } + break; + } case X86::UNPCKLPDrr: case X86::VUNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Index: lib/Target/X86/Utils/X86ShuffleDecode.h =================================================================== --- lib/Target/X86/Utils/X86ShuffleDecode.h +++ lib/Target/X86/Utils/X86ShuffleDecode.h @@ -86,6 +86,11 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask); +/// \brief Decode a shuffle packed values at 128-bit granularity +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask); + /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl &ShuffleMask); Index: lib/Target/X86/Utils/X86ShuffleDecode.cpp =================================================================== --- lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -264,6 +264,26 @@ } } +/// \brief Decode a shuffle packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT.getVectorElementType().getSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. + if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } +} + void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { unsigned HalfSize = VT.getVectorNumElements() / 2; Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -10713,6 +10713,42 @@ } } +/// \brief Try to lower a vector shuffle as a 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, + ArrayRef Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + assert(VT.getVectorElementType().getSizeInBits() == 64 && + "Unexpected element type size for 128bit shuffle."); + + // To handle 256 bit vector requires VLX and most probably + // function lowerV2X128VectorShuffle() is better solution. + assert(VT.getSizeInBits() == 512 && + "Unexpected vector size for 128bit shuffle."); + + SmallVector WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + + // Form a 128-bit permutation. + // Convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + unsigned ControlBitsNum = WidenedMask.size() / 2; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // Use first element in place of undef musk. + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; + PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { @@ -10745,6 +10781,10 @@ ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Shuf128; + if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; @@ -10781,6 +10821,10 @@ ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; + if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -4162,7 +4162,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] ; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] ; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) @@ -4179,8 +4181,11 @@ ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -4200,7 +4205,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] ; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4) @@ -4217,7 +4224,9 @@ ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4) Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -1867,7 +1867,7 @@ define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) { ;CHECK-LABEL: test_mask_xor_epi32_rmbkz_128 - ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07] + ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2299,7 +2299,7 @@ define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_add_ps_256 - ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1} + ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1} %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -2321,7 +2321,7 @@ define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_add_ps_128 - ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1} + ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1} %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -2343,7 +2343,7 @@ define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_sub_ps_256 - ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1} + ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1} %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -2365,7 +2365,7 @@ define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_sub_ps_128 - ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1} + ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1} %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -2387,7 +2387,7 @@ define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_mul_ps_256 - ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1} + ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1} %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -2409,7 +2409,7 @@ define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_mul_ps_128 - ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1} + ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1} %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -2431,7 +2431,7 @@ define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_div_ps_256 - ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1} + ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1} %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -2453,7 +2453,7 @@ define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_div_ps_128 - ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1} + ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1} %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -2475,7 +2475,7 @@ define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_max_ps_256 - ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1} + ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1} %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -2497,7 +2497,7 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_max_ps_128 - ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1} + ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1} %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -2519,7 +2519,7 @@ define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_min_ps_256 - ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1} + ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1} %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -2541,7 +2541,7 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) { ;CHECK-LABEL: test_mm512_mask_min_ps_128 - ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1} + ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1} %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -2591,9 +2591,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxsd %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) { %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2 ,i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) @@ -2604,9 +2604,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxsd %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -2617,9 +2617,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxsq %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -2630,9 +2630,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxsq %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) { %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) @@ -2643,9 +2643,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxud %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2,i8 %mask) { %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) @@ -2656,9 +2656,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxud %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -2669,9 +2669,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxuq %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -2682,9 +2682,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpmaxuq %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) { %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) @@ -2695,9 +2695,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminsd %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) { %res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) @@ -2708,9 +2708,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminsd %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { %res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -2721,9 +2721,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminsq %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { %res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -2734,9 +2734,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminsq %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) { %res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) @@ -2747,9 +2747,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminud %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) { %res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) @@ -2760,9 +2760,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminud %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { %res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -2773,9 +2773,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_128 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminuq %xmm -; CHECK: {%k1} +; CHECK: {%k1} define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { %res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -2786,9 +2786,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_256 -; CHECK-NOT: call +; CHECK-NOT: call ; CHECK: vpminuq %ymm -; CHECK: {%k1} +; CHECK: {%k1} define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) { %res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) @@ -2799,8 +2799,8 @@ declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_128 -; CHECK-NOT: call -; CHECK: kmov +; CHECK-NOT: call +; CHECK: kmov ; CHECK: vpermt2d %xmm{{.*}}{%k1} ; CHECK-NOT: {z} define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { @@ -2813,8 +2813,8 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_128 -; CHECK-NOT: call -; CHECK: kmov +; CHECK-NOT: call +; CHECK: kmov ; CHECK: vpermt2d %xmm{{.*}}{%k1} {z} define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) @@ -2826,8 +2826,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_256 -; CHECK-NOT: call -; CHECK: kmov +; CHECK-NOT: call +; CHECK: kmov ; CHECK: vpermt2d %ymm{{.*}}{%k1} ; CHECK-NOT: {z} define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { @@ -2840,8 +2840,8 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_256 -; CHECK-NOT: call -; CHECK: kmov +; CHECK-NOT: call +; CHECK: kmov ; CHECK: vpermt2d {{.*}}{%k1} {z} define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) @@ -2853,9 +2853,9 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpermi2pd %xmm{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2pd %xmm{{.*}}{%k1} define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) @@ -2866,9 +2866,9 @@ declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpermi2pd %ymm{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2pd %ymm{{.*}}{%k1} define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) @@ -2879,9 +2879,9 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpermi2ps %xmm{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2ps %xmm{{.*}}{%k1} define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) @@ -2892,9 +2892,9 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpermi2ps %ymm{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2ps %ymm{{.*}}{%k1} define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) @@ -2905,9 +2905,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpabsq{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsq{{.*}}{%k1} define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { %res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1) @@ -2918,9 +2918,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpabsq{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsq{{.*}}{%k1} define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { %res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1) @@ -2931,9 +2931,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpabsd{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsd{{.*}}{%k1} define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { %res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) @@ -2944,9 +2944,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpabsd{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsd{{.*}}{%k1} define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { %res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1) @@ -2958,9 +2958,9 @@ declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vscalefpd{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefpd{{.*}}{%k1} define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) @@ -2971,9 +2971,9 @@ declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vscalefpd{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefpd{{.*}}{%k1} define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) @@ -2983,9 +2983,9 @@ declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vscalefps{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefps{{.*}}{%k1} define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) @@ -2995,9 +2995,9 @@ declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vscalefps{{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefps{{.*}}{%k1} define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) @@ -3009,7 +3009,7 @@ define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128: -; CHECK: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xc1] %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) @@ -3046,7 +3046,7 @@ define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256: ; CHECK: ## BB#0: -; CHECK: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} +; CHECK: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} ; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xc1] %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) @@ -4457,9 +4457,9 @@ declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vrndscalepd {{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vrndscalepd {{.*}}{%k1} ; CHECK: vrndscalepd define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) { %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3) @@ -4470,9 +4470,9 @@ declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vrndscalepd {{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vrndscalepd {{.*}}{%k1} ; CHECK: vrndscalepd define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) { %res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3) @@ -4483,9 +4483,9 @@ declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_128 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vrndscaleps {{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vrndscaleps {{.*}}{%k1} ; CHECK: vrndscaleps define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) { %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3) @@ -4497,9 +4497,9 @@ declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8) ; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_256 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vrndscaleps {{.*}}{%k1} +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vrndscaleps {{.*}}{%k1} ; CHECK: vrndscaleps define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) { %res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3) @@ -4516,13 +4516,20 @@ ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: ## ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 } declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) @@ -4533,13 +4540,20 @@ ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3] +; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: ## ymm3 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: retq %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res2, %res3 + ret <4 x double> %res4 } declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) @@ -4550,7 +4564,9 @@ ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) @@ -4567,7 +4583,9 @@ ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) Index: test/CodeGen/X86/vector-shuffle-512-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v8.ll +++ test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -88,8 +88,7 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vshuff64x2 $160, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,0,1,4,5,4,5] ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -667,8 +666,7 @@ define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5] -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vshufi64x2 $160, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,0,1,4,5,4,5] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1186,3 +1184,85 @@ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } + +define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind { +; ALL-LABEL: test_vshuff64x2_512: +; ALL: # BB#0: +; ALL-NEXT: vshuff64x2 $24, %zmm1, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] +; ALL-NEXT: retq + %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> + ret <8 x double> %res +} + +define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind { +; ALL-LABEL: test_vshuff64x2_512_maskz: +; ALL: # BB#0: +; ALL-NEXT: vpmovsxwq %xmm2, %zmm2 +; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; ALL-NEXT: vptestmq %zmm2, %zmm2, %k1 +; ALL-NEXT: vshuff64x2 $24, %zmm1, %zmm0, %zmm0 {%k1} {z} # zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] +; ALL-NEXT: retq + %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> + %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer + ret <8 x double> %res +} + +define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind { +; ALL-LABEL: test_vshufi64x2_512_mask: +; ALL: # BB#0: +; ALL-NEXT: vpmovsxwq %xmm2, %zmm2 +; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; ALL-NEXT: vptestmq %zmm2, %zmm2, %k1 +; ALL-NEXT: vshufi64x2 $24, %zmm1, %zmm0, %zmm0 {%k1} # zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] +; ALL-NEXT: retq + %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> + %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x + ret <8 x i64> %res +} + +define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind { +; ALL-LABEL: test_vshuff64x2_512_mem: +; ALL: # BB#0: +; ALL-NEXT: vshuff64x2 $24, (%rdi), %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] +; ALL-NEXT: retq + %x1 = load <8 x double>,<8 x double> *%ptr,align 1 + %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> + ret <8 x double> %res +} + +define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind { +; ALL-LABEL: test_vshuff64x2_512_mem_mask: +; ALL: # BB#0: +; ALL-NEXT: vpmovsxwq %xmm1, %zmm1 +; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; ALL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; ALL-NEXT: vshuff64x2 $24, (%rdi), %zmm0, %zmm0 {%k1} # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] +; ALL-NEXT: retq + %x1 = load <8 x double>,<8 x double> *%ptr,align 1 + %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> + %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> %x + ret <8 x double> %res +} + +define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind { +; ALL-LABEL: test_vshuff64x2_512_mem_maskz: +; ALL: # BB#0: +; ALL-NEXT: vpmovsxwq %xmm1, %zmm1 +; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; ALL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; ALL-NEXT: vshuff64x2 $24, (%rdi), %zmm0, %zmm0 {%k1} {z} # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] +; ALL-NEXT: retq + %x1 = load <8 x double>,<8 x double> *%ptr,align 1 + %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> + %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer + ret <8 x double> %res +} + +define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) nounwind { +; ALL-LABEL: test_vshuff32x4_512: +; ALL: # BB#0: +; ALL-NEXT: vshuff64x2 $20, %zmm1, %zmm0, %zmm0 # zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1] +; ALL-NEXT: retq + %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> + ret <16 x float> %res +} \ No newline at end of file Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -213,8 +213,7 @@ ; AVX512F-NEXT: movzbl %dil, %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u> -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1] ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -224,8 +223,7 @@ ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: kmovb %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u> -; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1] ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq