Index: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -482,8 +482,9 @@
 
 void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
                        SmallVectorImpl<int> &ShuffleMask) {
-  for (int i = 0, e = RawMask.size(); i < e; ++i) {
-    uint64_t M = RawMask[i];
+  uint64_t EltMaskSize = (RawMask.size() * 2) - 1;
+  for (auto M : RawMask) {
+    M &= EltMaskSize;
     ShuffleMask.push_back((int)M);
   }
 }
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -5138,16 +5138,11 @@
     return false;
   }
   case X86ISD::VPERMV3: {
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
     Ops.push_back(N->getOperand(0));
     Ops.push_back(N->getOperand(2));
     SDValue MaskNode = N->getOperand(1);
-    SmallVector<uint64_t, 32> RawMask;
-    unsigned MaskLoBits = Log2_64(VT.getVectorNumElements() * 2);
-    if (getTargetShuffleMaskIndices(MaskNode, MaskLoBits, RawMask)) {
-      DecodeVPERMV3Mask(RawMask, Mask);
-      break;
-    }
     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
       DecodeVPERMV3Mask(C, VT, Mask);
       break;
@@ -29202,6 +29197,7 @@
   case X86ISD::MOVDDUP:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VPERMV3:
   case X86ISD::VPERMILPI:
   case X86ISD::VPERMILPV:
   case X86ISD::VPERM2X128:
Index: llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -191,6 +191,7 @@
   Type *MaskTy = C->getType();
   unsigned NumElements = MaskTy->getVectorNumElements();
   if (NumElements == VT.getVectorNumElements()) {
+    unsigned EltMaskSize = Log2_64(NumElements * 2);
     for (unsigned i = 0; i < NumElements; ++i) {
       Constant *COp = C->getAggregateElement(i);
       if (!COp) {
@@ -200,9 +201,9 @@
       if (isa<UndefValue>(COp))
         ShuffleMask.push_back(SM_SentinelUndef);
       else {
-        uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
-        Element &= (1 << NumElements*2) - 1;
-        ShuffleMask.push_back(Element);
+        APInt Element = cast<ConstantInt>(COp)->getValue();
+        Element = Element.getLoBits(EltMaskSize);
+        ShuffleMask.push_back(Element.getZExtValue());
       }
     }
   }
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -13,10 +13,6 @@
 define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
 ; CHECK-LABEL: combine_vpermt2var_8f64_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; CHECK-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
-; CHECK-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %x0, <8 x double> %x1, i8 -1)
   %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %res0, <8 x double> %res0, i8 -1)
@@ -26,10 +22,6 @@
 define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
 ; CHECK-LABEL: combine_vpermt2var_8i64_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; CHECK-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
-; CHECK-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1)
@@ -39,10 +31,6 @@
 define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
 ; CHECK-LABEL: combine_vpermt2var_16f32_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; CHECK-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; CHECK-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %res0, <16 x float> %res0, i16 -1)
@@ -52,10 +40,6 @@
 define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
 ; CHECK-LABEL: combine_vpermt2var_16i32_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; CHECK-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; CHECK-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 -1)
@@ -65,10 +49,6 @@
 define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
 ; CHECK-LABEL: combine_vpermt2var_32i16_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; CHECK-NEXT:    vpermt2w %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    vmovdqu16 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; CHECK-NEXT:    vpermt2w %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 -1)
   %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %res0, <32 x i16> %res0, i32 -1)
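For context only, not part of the patch: a minimal standalone sketch of the index wrapping that the updated DecodeVPERMV3Mask performs, using std::vector in place of llvm::ArrayRef/SmallVectorImpl; the mask values in main() are made-up illustrative data, not taken from the tests above.

// Illustrative sketch only; plain C++, no LLVM dependencies.
#include <cstdint>
#include <iostream>
#include <vector>

// VPERMV3 selects from two source vectors, so each raw mask element is
// reduced modulo twice the element count (a power of two), matching the
// "M &= EltMaskSize" step added in X86ShuffleDecode.cpp.
static std::vector<int> decodeVPERMV3Mask(const std::vector<uint64_t> &RawMask) {
  std::vector<int> ShuffleMask;
  uint64_t EltMaskSize = (RawMask.size() * 2) - 1; // e.g. 8 elements -> 0xF
  for (uint64_t M : RawMask)
    ShuffleMask.push_back(static_cast<int>(M & EltMaskSize));
  return ShuffleMask;
}

int main() {
  // For an 8-element shuffle only indices 0..15 are meaningful; the stray
  // index 17 wraps to 1.
  for (int M : decodeVPERMV3Mask({7, 14, 5, 12, 3, 10, 1, 17}))
    std::cout << M << ' ';
  std::cout << '\n'; // prints: 7 14 5 12 3 10 1 1
  return 0;
}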