Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -441,6 +441,8 @@ VPSHA, VPSHL, // XOP signed/unsigned integer comparisons VPCOM, VPCOMU, + // XOP packed permute bytes + VPPERM, // Vector multiply packed unsigned doubleword integers PMULUDQ, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -21478,6 +21478,7 @@ case X86ISD::VSRAI: return "X86ISD::VSRAI"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; + case X86ISD::VPPERM: return "X86ISD::VPPERM"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -251,6 +251,10 @@ SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; +def X86vpperm : SDNode<"X86ISD::VPPERM", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; + def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; Index: lib/Target/X86/X86InstrXOP.td =================================================================== --- lib/Target/X86/X86InstrXOP.td +++ lib/Target/X86/X86InstrXOP.td @@ -222,8 +222,47 @@ defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; } +multiclass xop4op opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { + def rr : IXOPi8, + XOP_4V, VEX_I8IMM; + def rm : IXOPi8, + XOP_4V, VEX_I8IMM, VEX_W, MemOp4; + def mr : IXOPi8, + XOP_4V, VEX_I8IMM; + // For disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : IXOPi8, XOP_4V, VEX_I8IMM, 
VEX_W, MemOp4; +} + +let ExeDomain = SSEPackedInt in { + defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8>; +} + // Instruction where either second or third source can be memory -multiclass xop4op opc, string OpcodeStr, Intrinsic Int> { +multiclass xop4op_int opc, string OpcodeStr, Intrinsic Int> { def rr : IXOPi8; - defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; + defm VPCMOV : xop4op_int<0xA2, "vpcmov", int_x86_xop_vpcmov>; } multiclass xop4op256 opc, string OpcodeStr, Intrinsic Int> { Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -2278,6 +2278,7 @@ X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0), X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), Index: lib/Target/X86/X86MCInstLower.cpp =================================================================== --- lib/Target/X86/X86MCInstLower.cpp +++ lib/Target/X86/X86MCInstLower.cpp @@ -1018,7 +1018,8 @@ } static std::string getShuffleComment(const MachineOperand &DstOp, - const MachineOperand &SrcOp, + const MachineOperand &SrcOp1, + const MachineOperand &SrcOp2, ArrayRef Mask) { std::string Comment; @@ -1032,39 +1033,51 @@ }; StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem"; - StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem"; + StringRef Src1Name = + SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem"; + StringRef Src2Name = + SrcOp2.isReg() ? 
GetRegisterName(SrcOp2.getReg()) : "mem"; + + SmallVector ShuffleMask(Mask.begin(), Mask.end()); + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] -= e; + } + } raw_string_ostream CS(Comment); CS << DstName << " = "; - bool NeedComma = false; - bool InSrc = false; - for (int M : Mask) { - // Wrap up any prior entry... - if (M == SM_SentinelZero && InSrc) { - InSrc = false; - CS << "]"; - } - if (NeedComma) + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) CS << ","; - else - NeedComma = true; - - // Print this shuffle... - if (M == SM_SentinelZero) { + if (ShuffleMask[i] == SM_SentinelZero) { CS << "zero"; - } else { - if (!InSrc) { - InSrc = true; - CS << SrcName << "["; - } - if (M == SM_SentinelUndef) + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < (int)e; + CS << (isSrc1 ? Src1Name : Src2Name) << '['; + + bool IsFirst = true; + while (i != e && ShuffleMask[i] != SM_SentinelZero && + (ShuffleMask[i] < (int)e) == isSrc1) { + if (!IsFirst) + CS << ','; + else + IsFirst = false; + if (ShuffleMask[i] == SM_SentinelUndef) CS << "u"; else - CS << M; + CS << ShuffleMask[i] % (int)e; + ++i; } + CS << ']'; + --i; // For loop increments element #. 
} - if (InSrc) - CS << "]"; CS.flush(); return Comment; @@ -1313,7 +1326,7 @@ SmallVector Mask; DecodePSHUFBMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask)); } break; } @@ -1340,7 +1353,25 @@ SmallVector Mask; DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask)); + } + break; + } + case X86::VPPERMrm: { + if (!OutStreamer->isVerboseAsm()) + break; + assert(MI->getNumOperands() > 6 && + "We should always have at least 7 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp1 = MI->getOperand(1); + const MachineOperand &SrcOp2 = MI->getOperand(2); + const MachineOperand &MaskOp = MI->getOperand(6); + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector Mask; + DecodeVPPERMMask(C, Mask); + if (!Mask.empty()) + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp1, SrcOp2, Mask)); } break; } Index: lib/Target/X86/X86ShuffleDecodeConstantPool.h =================================================================== --- lib/Target/X86/X86ShuffleDecodeConstantPool.h +++ lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -32,6 +32,9 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, SmallVectorImpl &ShuffleMask); +/// \brief Decode a VPPERM variable mask from an IR-level vector constant. +void DecodeVPPERMMask(const Constant *C, SmallVectorImpl &ShuffleMask); + /// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
void DecodeVPERMVMask(const Constant *C, MVT VT, SmallVectorImpl &ShuffleMask); Index: lib/Target/X86/X86ShuffleDecodeConstantPool.cpp =================================================================== --- lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -153,6 +153,74 @@ // TODO: Handle funny-looking vectors too. } +void DecodeVPPERMMask(const Constant *C, SmallVectorImpl &ShuffleMask) { + Type *MaskTy = C->getType(); + assert(MaskTy->getPrimitiveSizeInBits() == 128); + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure it's an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // The shuffle mask requires a byte vector - decode cases with + // wider elements as well. + unsigned BitWidth = cast(VecEltTy)->getBitWidth(); + if ((BitWidth % 8) != 0) + return; + + int NumElts = MaskTy->getVectorNumElements(); + int Scale = BitWidth / 8; + int NumBytes = NumElts * Scale; + ShuffleMask.reserve(NumBytes); + + for (unsigned i = 0; i != NumElts; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa(COp)) { + ShuffleMask.append(Scale, SM_SentinelUndef); + continue; + } + + // VPPERM Operation + // Bits[4:0] - Byte Index (0 - 31) + // Bits[7:5] - Permute Operation + // + // Permute Operation: + // 0 - Source byte (no logical operation). + // 1 - Invert source byte. + // 2 - Bit reverse of source byte. + // 3 - Bit reverse of inverted source byte. + // 4 - 00h (zero - fill). + // 5 - FFh (ones - fill). + // 6 - Most significant bit of source byte replicated in all bit positions. + // 7 - Invert most significant bit of source byte and replicate in all bit positions.
+ APInt APElt = cast(COp)->getValue(); + for (int j = 0; j != Scale; ++j) { + APInt Index = APElt.getLoBits(5); + APInt PermuteOp = APElt.lshr(5).getLoBits(3); + APElt = APElt.lshr(8); + + if (PermuteOp == 4) { + ShuffleMask.push_back(SM_SentinelZero); + continue; + } + if (PermuteOp != 0) { + ShuffleMask.clear(); + return; + } + ShuffleMask.push_back((int)Index.getZExtValue()); + } + } + + assert(NumBytes == (int)ShuffleMask.size() && "Unexpected shuffle mask size"); +} + void DecodeVPERMVMask(const Constant *C, MVT VT, SmallVectorImpl &ShuffleMask) { Type *MaskTy = C->getType(); Index: test/CodeGen/X86/vector-shuffle-combining-xop.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -13,19 +13,28 @@ define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: combine_vpperm_identity: ; CHECK: # BB#0: -; CHECK-NEXT: vpperm {{.*}}(%rip), %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; CHECK-NEXT: retq %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> ) %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> ) ret <16 x i8> %res1 } +define <16 x i8> @combine_vpperm_as_unary_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: combine_vpperm_as_unary_unpckhwd: +; CHECK: # BB#0: +; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: retq + %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> ) + ret <16 x i8> %res0 +} + define <16 x i8> @combine_vpperm_as_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: combine_vpperm_as_unpckhwd: ; CHECK: # BB#0: -; CHECK-NEXT: vpperm 
{{.*}}(%rip), %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; CHECK-NEXT: retq - %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> ) + %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> ) ret <16 x i8> %res0 }