Index: lib/Target/X86/InstPrinter/X86InstComments.cpp
===================================================================
--- lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -726,7 +726,64 @@
     Src1Name = getRegName(MI->getOperand(1).getReg());
     DestName = getRegName(MI->getOperand(0).getReg());
     break;
-
+  case X86::VSHUFF64X2Zrri:
+  case X86::VSHUFF64X2Zrrik:
+  case X86::VSHUFF64X2Zrrikz:
+  case X86::VSHUFI64X2Zrri:
+  case X86::VSHUFI64X2Zrrik:
+  case X86::VSHUFI64X2Zrrikz: {
+    unsigned NumOp = MI->getNumOperands();
+    assert((NumOp >= 4) && "Expected at least 4 operands!");
+    DecodeVSHUF128Mask(MVT::v8i64, MI->getOperand(NumOp - 1).getImm(),
+                       ShuffleMask);
+    Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
+    Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
+  case X86::VSHUFF64X2Zrmi:
+  case X86::VSHUFF64X2Zrmik:
+  case X86::VSHUFF64X2Zrmikz:
+  case X86::VSHUFI64X2Zrmi:
+  case X86::VSHUFI64X2Zrmik:
+  case X86::VSHUFI64X2Zrmikz: {
+    unsigned NumOp = MI->getNumOperands();
+    assert((NumOp >= 8) && "Expected at least 8 operands!");
+    DecodeVSHUF128Mask(MVT::v8i64, MI->getOperand(NumOp - 1).getImm(),
+                       ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
+  case X86::VSHUFF32X4Zrri:
+  case X86::VSHUFF32X4Zrrik:
+  case X86::VSHUFF32X4Zrrikz:
+  case X86::VSHUFI32X4Zrri:
+  case X86::VSHUFI32X4Zrrik:
+  case X86::VSHUFI32X4Zrrikz: {
+    unsigned NumOp = MI->getNumOperands();
+    assert((NumOp >= 4) && "Expected at least 4 operands!");
+    DecodeVSHUF128Mask(MVT::v16i32, MI->getOperand(NumOp - 1).getImm(),
+                       ShuffleMask);
+    Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
+    Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
+  case X86::VSHUFF32X4Zrmi:
+  case X86::VSHUFF32X4Zrmik:
+  case X86::VSHUFF32X4Zrmikz:
+  case X86::VSHUFI32X4Zrmi:
+  case X86::VSHUFI32X4Zrmik:
+  case X86::VSHUFI32X4Zrmikz: {
+    unsigned NumOp = MI->getNumOperands();
+    assert((NumOp >= 8) && "Expected at least 8 operands!");
+    DecodeVSHUF128Mask(MVT::v16i32, MI->getOperand(NumOp - 1).getImm(),
+                       ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
   case X86::UNPCKLPDrr:
   case X86::VUNPCKLPDrr:
     Src2Name = getRegName(MI->getOperand(2).getReg());
Index: lib/Target/X86/Utils/X86ShuffleDecode.h
===================================================================
--- lib/Target/X86/Utils/X86ShuffleDecode.h
+++ lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -85,6 +85,10 @@
 
 void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
                           SmallVectorImpl<int> &ShuffleMask);
+/// \brief Decode the immediate mask of a 128-bit granularity packed shuffle
+/// into a shuffle mask.
+void DecodeVSHUF128Mask(MVT VT, unsigned Imm,
+                        SmallVectorImpl<int> &ShuffleMask);
 
 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
 /// No VT provided since it only works on 256-bit, 4 element vectors.
Index: lib/Target/X86/Utils/X86ShuffleDecode.cpp
===================================================================
--- lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -264,6 +264,26 @@
   }
 }
 
+/// \brief Decode the immediate mask of a 128-bit granularity packed shuffle
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
+/// into a shuffle mask.
+void DecodeVSHUF128Mask(MVT VT, unsigned Imm,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumElementsInLane = 128 / VT.getVectorElementType().getSizeInBits();
+  unsigned ControlBitsMask = NumLanes - 1;
+  unsigned NumControlBits = NumLanes / 2;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+    // We actually need the other source.
+    if (l >= NumLanes / 2)
+      LaneMask += NumLanes;
+    for (unsigned i = 0; i != NumElementsInLane; ++i)
+      ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+  }
+}
+
 void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
                           SmallVectorImpl<int> &ShuffleMask) {
   unsigned HalfSize = VT.getVectorNumElements() / 2;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10713,6 +10713,73 @@
   }
 }
 
+/// \brief Try to lower a 512-bit vector shuffle of 64-bit elements as a
+/// single 128-bit lane shuffle (vshuff64x2 / vshufi64x2).
+///
+/// X86ISD::SHUF128 takes the two low 128-bit lanes of the result from its
+/// first operand and the two high lanes from its second operand, so each half
+/// of the widened mask must read from a single input. That input (\p V1 or
+/// \p V2) becomes the matching operand.
+///
+/// \returns the SHUF128 node, or an empty SDValue when the mask cannot be
+/// widened to 128-bit lanes, contains zeroable lanes, or mixes both inputs
+/// within one half of the result.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
+                                        ArrayRef<int> Mask,
+                                        SDValue V1, SDValue V2,
+                                        SelectionDAG &DAG) {
+  assert(VT.getVectorElementType().getSizeInBits() == 64 &&
+         "Unexpected element type size for 128bit shuffle.");
+
+  // Handling 256-bit vectors requires VLX, and lowerV2X128VectorShuffle()
+  // is most probably the better solution there.
+  assert(VT.getSizeInBits() == 512 &&
+         "Unexpected vector size for 128bit shuffle.");
+
+  SmallVector<int, 4> WidenedMask;
+  if (!canWidenShuffleElements(Mask, WidenedMask))
+    return SDValue();
+
+  // Form a 128-bit permutation.
+  // Convert the 64-bit shuffle mask selection values into 128-bit selection
+  // bits defined by a vshuf64x2 instruction's immediate control byte.
+  const int NumLanes = WidenedMask.size();
+  const unsigned ControlBitsNum = NumLanes / 2;
+  unsigned PermMask = 0;
+
+  // Ops[0] feeds the low half of the result, Ops[1] the high half. A null
+  // entry means no lane of that half has picked an input yet.
+  SDValue Ops[2];
+
+  for (int i = 0; i != NumLanes; ++i) {
+    const int Lane = WidenedMask[i];
+    if (Lane == SM_SentinelZero)
+      return SDValue();
+
+    // An undef lane may read anything; leave its selector at zero.
+    if (Lane == SM_SentinelUndef)
+      continue;
+
+    // Widened indices below NumLanes refer to V1, the rest to V2.
+    SDValue Src = Lane < NumLanes ? V1 : V2;
+    SDValue &Op = Ops[i < NumLanes / 2 ? 0 : 1];
+    if (!Op.getNode())
+      Op = Src;
+    else if (Op != Src)
+      return SDValue();
+
+    PermMask |= unsigned(Lane % NumLanes) << (i * ControlBitsNum);
+  }
+
+  // A half made only of undef lanes can use an undef operand.
+  for (SDValue &Op : Ops)
+    if (!Op.getNode())
+      Op = DAG.getUNDEF(VT);
+
+  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
+                     DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
 static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
@@ -10745,6 +10812,10 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+    return Shuf128;
+
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Unpck;
@@ -10781,6 +10852,10 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+    return Shuf128;
+
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -4162,7 +4162,9 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: retq
   %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
@@ -4179,8 +4181,11 @@
 ; CHECK-NEXT: movzbl %dil, %eax
 ; CHECK-NEXT: kmovw %eax, %k1
 ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT: retq
@@ -4200,7 +4205,9 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
 ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
@@ -4217,7 +4224,9 @@
 ; CHECK-NEXT: movzbl %dil, %eax
 ; CHECK-NEXT: kmovw %eax, %k1
 ; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -88,8 +88,7 @@
 define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_01014545:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5]
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x double> %shuffle
@@ -667,8 +666,7 @@
 define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_01014545:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5]
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x i64> %shuffle
@@ -1186,3 +1184,85 @@
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
   ret <8 x i64> %shuffle
 }
+
+define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
+; ALL-LABEL: test_vshuff64x2_512:
+; ALL: # BB#0:
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; ALL-NEXT: retq
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
+; ALL-LABEL: test_vshuff64x2_512_maskz:
+; ALL: # BB#0:
+; ALL-NEXT: vpmovsxwq %xmm2, %zmm2
+; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; ALL-NEXT: vptestmq %zmm2, %zmm2, %k1
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; ALL-NEXT: retq
+  %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+  %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
+; ALL-LABEL: test_vshufi64x2_512_mask:
+; ALL: # BB#0:
+; ALL-NEXT: vpmovsxwq %xmm2, %zmm2
+; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; ALL-NEXT: vptestmq %zmm2, %zmm2, %k1
+; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; ALL-NEXT: retq
+  %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+  %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
+  ret <8 x i64> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
+; ALL-LABEL: test_vshuff64x2_512_mem:
+; ALL: # BB#0:
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; ALL-NEXT: retq
+  %x1 = load <8 x double>,<8 x double> *%ptr,align 1
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
+; ALL-LABEL: test_vshuff64x2_512_mem_mask:
+; ALL: # BB#0:
+; ALL-NEXT: vpmovsxwq %xmm1, %zmm1
+; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; ALL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; ALL-NEXT: retq
+  %x1 = load <8 x double>,<8 x double> *%ptr,align 1
+  %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+  %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> %x
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
+; ALL-LABEL: test_vshuff64x2_512_mem_maskz:
+; ALL: # BB#0:
+; ALL-NEXT: vpmovsxwq %xmm1, %zmm1
+; ALL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; ALL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; ALL-NEXT: retq
+  %x1 = load <8 x double>,<8 x double> *%ptr,align 1
+  %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+  %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) nounwind {
+; ALL-LABEL: test_vshuff32x4_512:
+; ALL: # BB#0:
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
+; ALL-NEXT: retq
+  %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+  ret <16 x float> %res
+}
Index: test/CodeGen/X86/vector-shuffle-v1.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-v1.ll
+++ test/CodeGen/X86/vector-shuffle-v1.ll
@@ -213,8 +213,7 @@
 ; AVX512F-NEXT: movzbl %dil, %eax
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
@@ -224,8 +223,7 @@
 ; VL_BW_DQ: # BB#0:
 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u>
-; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
 ; VL_BW_DQ-NEXT: retq