Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3796,7 +3796,7 @@
 /// Match "(X shl/srl V1) & V2" where V2 may not be present.
 static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
   if (Op.getOpcode() == ISD::AND) {
-    if (isa<ConstantSDNode>(Op.getOperand(1))) {
+    if (isConstOrConstSplat(Op.getOperand(1))) {
       Mask = Op.getOperand(1);
       Op = Op.getOperand(0);
     } else {
@@ -3813,105 +3813,106 @@
 }
 
 // Return true if we can prove that, whenever Neg and Pos are both in the
-// range [0, OpSize), Neg == (Pos == 0 ? 0 : OpSize - Pos). This means that
+// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
 // for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
 //
 //     (or (shift1 X, Neg), (shift2 X, Pos))
 //
 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
-// in direction shift1 by Neg. The range [0, OpSize) means that we only need
+// in direction shift1 by Neg. The range [0, EltSize) means that we only need
 // to consider shift amounts with defined behavior.
-static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned OpSize) {
-  // If OpSize is a power of 2 then:
+static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
+  // If EltSize is a power of 2 then:
   //
-  //  (a) (Pos == 0 ? 0 : OpSize - Pos) == (OpSize - Pos) & (OpSize - 1)
-  //  (b) Neg == Neg & (OpSize - 1) whenever Neg is in [0, OpSize).
+  //  (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
+  //  (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
   //
-  // So if OpSize is a power of 2 and Neg is (and Neg', OpSize-1), we check
+  // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
   // for the stronger condition:
   //
-  //     Neg & (OpSize - 1) == (OpSize - Pos) & (OpSize - 1)    [A]
+  //     Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1)    [A]
   //
-  // for all Neg and Pos. Since Neg & (OpSize - 1) == Neg' & (OpSize - 1)
+  // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
   // we can just replace Neg with Neg' for the rest of the function.
   //
   // In other cases we check for the even stronger condition:
   //
-  //     Neg == OpSize - Pos                                    [B]
+  //     Neg == EltSize - Pos                                    [B]
   //
   // for all Neg and Pos. Note that the (or ...) then invokes undefined
-  // behavior if Pos == 0 (and consequently Neg == OpSize).
+  // behavior if Pos == 0 (and consequently Neg == EltSize).
   //
-  // We could actually use [A] whenever OpSize is a power of 2, but the
+  // We could actually use [A] whenever EltSize is a power of 2, but the
   // only extra cases that it would match are those uninteresting ones
   // where Neg and Pos are never in range at the same time. E.g. for
-  // OpSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
+  // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
   // as well as (sub 32, Pos), but:
   //
   //     (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
   //
   // always invokes undefined behavior for 32-bit X.
   //
-  // Below, Mask == OpSize - 1 when using [A] and is all-ones otherwise.
+  // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
   unsigned MaskLoBits = 0;
-  if (Neg.getOpcode() == ISD::AND &&
-      isPowerOf2_64(OpSize) &&
-      Neg.getOperand(1).getOpcode() == ISD::Constant &&
-      cast<ConstantSDNode>(Neg.getOperand(1))->getAPIntValue() == OpSize - 1) {
-    Neg = Neg.getOperand(0);
-    MaskLoBits = Log2_64(OpSize);
+  if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
+    if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
+      if (NegC->getAPIntValue() == EltSize - 1) {
+        Neg = Neg.getOperand(0);
+        MaskLoBits = Log2_64(EltSize);
+      }
+    }
   }
 
   // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
   if (Neg.getOpcode() != ISD::SUB)
     return 0;
-  ConstantSDNode *NegC = dyn_cast<ConstantSDNode>(Neg.getOperand(0));
+  ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
   if (!NegC)
     return 0;
   SDValue NegOp1 = Neg.getOperand(1);
 
-  // On the RHS of [A], if Pos is Pos' & (OpSize - 1), just replace Pos with
+  // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
   // Pos'. The truncation is redundant for the purpose of the equality.
-  if (MaskLoBits &&
-      Pos.getOpcode() == ISD::AND &&
-      Pos.getOperand(1).getOpcode() == ISD::Constant &&
-      cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() == OpSize - 1)
-    Pos = Pos.getOperand(0);
+  if (MaskLoBits && Pos.getOpcode() == ISD::AND)
+    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
+      if (PosC->getAPIntValue() == EltSize - 1)
+        Pos = Pos.getOperand(0);
 
   // The condition we need is now:
   //
-  //     (NegC - NegOp1) & Mask == (OpSize - Pos) & Mask
+  //     (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
   //
   // If NegOp1 == Pos then we need:
   //
-  //     OpSize & Mask == NegC & Mask
+  //     EltSize & Mask == NegC & Mask
   //
   // (because "x & Mask" is a truncation and distributes through subtraction).
   APInt Width;
   if (Pos == NegOp1)
     Width = NegC->getAPIntValue();
+
   // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
   // Then the condition we want to prove becomes:
   //
-  //     (NegC - NegOp1) & Mask == (OpSize - (NegOp1 + PosC)) & Mask
+  //     (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
   //
   // which, again because "x & Mask" is a truncation, becomes:
   //
-  //     NegC & Mask == (OpSize - PosC) & Mask
-  //     OpSize & Mask == (NegC + PosC) & Mask
-  else if (Pos.getOpcode() == ISD::ADD &&
-           Pos.getOperand(0) == NegOp1 &&
-           Pos.getOperand(1).getOpcode() == ISD::Constant)
-    Width = (cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() +
-             NegC->getAPIntValue());
-  else
+  //     NegC & Mask == (EltSize - PosC) & Mask
+  //     EltSize & Mask == (NegC + PosC) & Mask
+  else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
+    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
+      Width = PosC->getAPIntValue() + NegC->getAPIntValue();
+    else
+      return false;
+  } else
    return false;
 
-  // Now we just need to check that OpSize & Mask == Width & Mask.
+  // Now we just need to check that EltSize & Mask == Width & Mask.
   if (MaskLoBits)
-    // Opsize & Mask is 0 since Mask is Opsize - 1.
+    // EltSize & Mask is 0 since Mask is EltSize - 1.
     return Width.getLoBits(MaskLoBits) == 0;
-  return Width == OpSize;
+  return Width == EltSize;
 }
 
 // A subroutine of MatchRotate used once we have found an OR of two opposite
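
Editorial aside (not part of the patch): the power-of-two identity that matchRotateSub relies on is easy to sanity-check numerically. The sketch below is plain C++ with a hand-written rotlRef reference and made-up test values; it verifies that (X << Pos) | (X >> ((W - Pos) & (W - 1))) is a left rotate for every Pos in [0, W), including Pos == 0, which is exactly the case the masked condition [A] above covers.

#include <cassert>
#include <cstdint>

// Reference rotate-left, written so it never shifts by the full width.
static uint32_t rotlRef(uint32_t X, unsigned Amt) {
  Amt &= 31;
  return Amt ? (X << Amt) | (X >> (32 - Amt)) : X;
}

int main() {
  const unsigned W = 32;
  const uint32_t X = 0x12345678u; // arbitrary test value
  for (unsigned Pos = 0; Pos < W; ++Pos) {
    unsigned Neg = (W - Pos) & (W - 1);    // the masked form matched by [A]
    uint32_t Or = (X << Pos) | (X >> Neg); // the (or (shl ...), (srl ...)) pattern
    assert(Or == rotlRef(X, Pos) && "not a rotate");
  }
  return 0;
}
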
@@ -3931,7 +3932,7 @@
   // (srl x, (*ext y))) ->
   //   (rotr x, y) or (rotl x, (sub 32, y))
   EVT VT = Shifted.getValueType();
-  if (matchRotateSub(InnerPos, InnerNeg, VT.getSizeInBits())) {
+  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) {
     bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
                        HasPos ? Pos : Neg).getNode();
@@ -3974,10 +3975,10 @@
   if (RHSShift.getOpcode() == ISD::SHL) {
     std::swap(LHS, RHS);
     std::swap(LHSShift, RHSShift);
-    std::swap(LHSMask , RHSMask );
+    std::swap(LHSMask, RHSMask);
   }
 
-  unsigned OpSizeInBits = VT.getSizeInBits();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
   SDValue LHSShiftArg = LHSShift.getOperand(0);
   SDValue LHSShiftAmt = LHSShift.getOperand(1);
   SDValue RHSShiftArg = RHSShift.getOperand(0);
@@ -3985,11 +3986,10 @@
 
   // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
   // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
-  if (LHSShiftAmt.getOpcode() == ISD::Constant &&
-      RHSShiftAmt.getOpcode() == ISD::Constant) {
-    uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue();
-    uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue();
-    if ((LShVal + RShVal) != OpSizeInBits)
+  if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) {
+    uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue();
+    uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue();
+    if ((LShVal + RShVal) != EltSizeInBits)
       return nullptr;
 
     SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
@@ -3997,15 +3997,15 @@
 
     // If there is an AND of either shifted operand, apply it to the result.
     if (LHSMask.getNode() || RHSMask.getNode()) {
-      APInt Mask = APInt::getAllOnesValue(OpSizeInBits);
+      APInt Mask = APInt::getAllOnesValue(EltSizeInBits);
       if (LHSMask.getNode()) {
-        APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal);
-        Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits;
+        APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
+        Mask &= isConstOrConstSplat(LHSMask)->getAPIntValue() | RHSBits;
       }
       if (RHSMask.getNode()) {
-        APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal);
-        Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits;
+        APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal);
+        Mask &= isConstOrConstSplat(RHSMask)->getAPIntValue() | LHSBits;
       }
       Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, DL, VT));
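
One more editorial sketch before the X86 changes (again not part of the patch): the LowerRotate hook added below maps ISD::ROTL onto XOP's VPROT/VPROTI nodes, whose comment notes that a positive per-element amount rotates left and a negative one rotates right. A minimal scalar model of that convention, using an assumed 32-bit lane and made-up test values, looks like this:

#include <cassert>
#include <cstdint>

// Model of a single 32-bit lane: positive Amt rotates left, negative rotates
// right (a right rotate by k is the same as a left rotate by (32 - k) mod 32).
static uint32_t protLaneModel(uint32_t X, int8_t Amt) {
  unsigned Left = static_cast<unsigned>(Amt) & 31;
  return Left ? (X << Left) | (X >> (32 - Left)) : X;
}

int main() {
  const uint32_t X = 0x80000001u;
  assert(protLaneModel(X, 4) == 0x00000018u);  // rotate left by 4
  assert(protLaneModel(X, -4) == 0x18000000u); // rotate right by 4
  assert(protLaneModel(X, 0) == X);            // zero amount is the identity
  return 0;
}
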
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -1050,6 +1050,17 @@
     setOperationAction(ISD::SRA, MVT::v4i32, Custom);
   }
 
+  if (Subtarget->hasXOP()) {
+    setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+    setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
+    setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
+    setOperationAction(ISD::ROTL, MVT::v2i64, Custom);
+    setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
+    setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
+    setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
+    setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
+  }
+
   if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
     addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
@@ -18817,6 +18828,41 @@
   return SDValue();
 }
 
+static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+  unsigned Opc = Op.getOpcode();
+
+  assert(VT.isVector() && "Custom lowering only for vector rotates!");
+  assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
+  assert((Opc == ISD::ROTL) && "Only ROTL supported");
+
+  // XOP has 128-bit vector variable + immediate rotates.
+  // +ve/-ve Amt = rotate left/right.
+
+  // Split 256-bit integers.
+  if (VT.getSizeInBits() == 256)
+    return Lower256IntArith(Op, DAG);
+
+  assert(VT.getSizeInBits() == 128 && "Only rotate 128-bit vectors!");
+
+  // Attempt to rotate by immediate.
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+      assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+      return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
+                         DAG.getConstant(RotateAmt, DL, MVT::i8));
+    }
+  }
+
+  // Use general rotate by variable (per-element).
+  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
+}
+
 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
@@ -19675,6 +19721,7 @@
   case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
   case ISD::UMUL_LOHI:
   case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
+  case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
   case ISD::SRA:
   case ISD::SRL:
   case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
Index: llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
+++ llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
@@ -74,25 +74,10 @@
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; XOPAVX1-LABEL: var_rotate_v2i64:
-; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_rotate_v2i64:
-; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: var_rotate_v2i64:
+; XOP: # BB#0:
+; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
 ;
 ; X32-SSE-LABEL: var_rotate_v2i64:
 ; X32-SSE: # BB#0:
@@ -219,25 +204,10 @@
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; XOPAVX1-LABEL: var_rotate_v4i32:
-; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_rotate_v4i32:
-; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
-; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
-; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: var_rotate_v4i32:
+; XOP: # BB#0:
+; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
 ;
 ; X32-SSE-LABEL: var_rotate_v4i32:
 ; X32-SSE: # BB#0:
@@ -465,13 +435,7 @@
 ;
 ; XOP-LABEL: var_rotate_v8i16:
 ; XOP: # BB#0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm2
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpxor
%xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: var_rotate_v8i16: @@ -689,13 +653,7 @@ ; ; XOP-LABEL: var_rotate_v16i8: ; XOP: # BB#0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: var_rotate_v16i8: @@ -1321,9 +1279,7 @@ ; ; XOP-LABEL: splatconstant_rotate_v2i64: ; XOP: # BB#0: -; XOP-NEXT: vpsllq $14, %xmm0, %xmm1 -; XOP-NEXT: vpsrlq $50, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotq $14, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_v2i64: @@ -1357,9 +1313,7 @@ ; ; XOP-LABEL: splatconstant_rotate_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpslld $4, %xmm0, %xmm1 -; XOP-NEXT: vpsrld $28, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotd $4, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_v4i32: @@ -1393,9 +1347,7 @@ ; ; XOP-LABEL: splatconstant_rotate_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpsllw $7, %xmm0, %xmm1 -; XOP-NEXT: vpsrlw $9, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotw $7, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_v8i16: @@ -1433,12 +1385,7 @@ ; ; XOP-LABEL: splatconstant_rotate_v16i8: ; XOP: # BB#0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm2 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vprotb $4, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_v16i8: @@ -1483,11 +1430,8 @@ ; ; XOP-LABEL: splatconstant_rotate_mask_v2i64: ; XOP: # BB#0: -; XOP-NEXT: vpsllq $15, %xmm0, %xmm1 -; XOP-NEXT: vpsrlq $49, %xmm0, %xmm0 +; XOP-NEXT: vprotq $15, %xmm0, %xmm0 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64: @@ -1542,22 +1486,15 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1 -; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0 +; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpslld $4, %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsrld $28, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32: @@ -1601,11 +1538,8 @@ ; ; XOP-LABEL: splatconstant_rotate_mask_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpsllw $5, %xmm0, %xmm1 -; XOP-NEXT: vpsrlw $11, 
%xmm0, %xmm0 +; XOP-NEXT: vprotw $5, %xmm0, %xmm0 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16: @@ -1653,14 +1587,8 @@ ; ; XOP-LABEL: splatconstant_rotate_mask_v16i8: ; XOP: # BB#0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm2 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1 -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vprotb $4, %xmm0, %xmm0 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm1 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8: Index: llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll +++ llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll @@ -47,30 +47,20 @@ ; ; XOPAVX1-LABEL: var_rotate_v4i64: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] -; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm5, %xmm4 -; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm5, %xmm2 -; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3 -; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: var_rotate_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2 +; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %b64 = sub <4 x i64> , %b %shl = shl <4 x i64> %a, %b @@ -135,30 +125,20 @@ ; ; XOPAVX1-LABEL: var_rotate_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32] -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; XOPAVX1-NEXT: vpshld %xmm4, %xmm5, %xmm4 -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshld %xmm2, %xmm5, %xmm2 -; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 -; XOPAVX1-NEXT: vpshld %xmm3, %xmm0, %xmm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: 
retq ; ; XOPAVX2-LABEL: var_rotate_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2 +; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %b32 = sub <8 x i32> , %b %shl = shl <8 x i32> %a, %b @@ -262,40 +242,20 @@ ; ; XOPAVX1-LABEL: var_rotate_v16i16: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] -; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; XOPAVX1-NEXT: vpshlw %xmm4, %xmm5, %xmm4 -; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshlw %xmm2, %xmm5, %xmm2 -; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3 -; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: var_rotate_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2 -; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOPAVX2-NEXT: vpsubw %xmm3, %xmm5, %xmm3 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm2 -; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2 +; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %b16 = sub <16 x i16> , %b %shl = shl <16 x i16> %a, %b @@ -400,40 +360,20 @@ ; ; XOPAVX1-LABEL: var_rotate_v32i8: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm4 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm5, %xmm2 -; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, 
%xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: var_rotate_v32i8: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm2 -; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOPAVX2-NEXT: vpsubb %xmm3, %xmm5, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2 +; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %b8 = sub <32 x i8> , %b %shl = shl <32 x i8> %a, %b @@ -789,21 +729,18 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_v4i64: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpsllq $14, %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpsrlq $50, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrlq $50, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllq $14, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlq $50, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <4 x i64> %a, %lshr = lshr <4 x i64> %a, @@ -833,21 +770,18 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <8 x i32> %a, %lshr = lshr <8 x i32> %a, @@ -877,21 +811,18 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_v16i16: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpsllw 
$7, %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpsllw $7, %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpsrlw $9, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrlw $9, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllw $7, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlw $9, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <16 x i16> %a, %lshr = lshr <16 x i16> %a, @@ -929,26 +860,18 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_v32i8: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 +; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_v32i8: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, @@ -989,28 +912,22 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpsllq $15, %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpsllq $15, %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrlq $49, %xmm2, %xmm2 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vprotq $15, %xmm1, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllq $15, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlq $49, %ymm0, %ymm0 
+; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <4 x i64> %a, %lshr = lshr <4 x i64> %a, @@ -1048,27 +965,21 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpslld $4, %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpsrld $28, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrld $28, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrld $28, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; XOPAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm2 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <8 x i32> %a, %lshr = lshr <8 x i32> %a, @@ -1104,25 +1015,20 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpsllw $5, %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpsllw $5, %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpsrlw $11, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrlw $11, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllw $5, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlw $11, %ymm0, %ymm0 +; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <16 x i16> %a, %lshr = lshr <16 x i16> %a, @@ -1166,30 +1072,20 @@ ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4 -; 
XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm1 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm1 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; XOPAVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a,