Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -483,9 +483,6 @@
     /// returns false.
     bool findBetterNeighborChains(StoreSDNode *St);

-    /// Match "(X shl/srl V1) & V2" where V2 may not be present.
-    bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask);
-
     /// Holds a pointer to an LSBaseSDNode as well as information on where it
     /// is located in a sequence of memory operations connected by a chain.
     struct MemOpLink {
@@ -5148,25 +5145,140 @@
   return SDValue();
 }

-/// Match "(X shl/srl V1) & V2" where V2 may not be present.
-bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
-  if (Op.getOpcode() == ISD::AND) {
-    if (DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
-      Mask = Op.getOperand(1);
-      Op = Op.getOperand(0);
-    } else {
-      return false;
-    }
+static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
+  if (Op.getOpcode() == ISD::AND &&
+      DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
+    Mask = Op.getOperand(1);
+    return Op.getOperand(0);
   }
+  return Op;
+}

+/// Match "(X shl/srl V1) & V2" where V2 may not be present.
+static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
+                            SDValue &Mask) {
+  Op = stripConstantMask(DAG, Op, Mask);
   if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
     Shift = Op;
     return true;
   }
-
   return false;
 }

+/// Helper function for visitOR to extract the needed side of a rotate idiom
+/// from a shl/srl/mul/udiv. This is meant to handle cases where
+/// InstCombine merged some outside op with one of the shifts from
+/// the rotate pattern.
+/// \returns An empty \c SDValue if the needed shift couldn't be extracted.
+/// Otherwise, returns an expansion of \p ExtractFrom based on the following
+/// patterns:
+///
+/// (or (mul v c0) (shrl (mul v c1) c2)):
+///     expands (mul v c0) -> (shl (mul v c1) c3)
+///
+/// (or (udiv v c0) (shl (udiv v c1) c2)):
+///     expands (udiv v c0) -> (shrl (udiv v c1) c3)
+///
+/// (or (shl v c0) (shrl (shl v c1) c2)):
+///     expands (shl v c0) -> (shl (shl v c1) c3)
+///
+/// (or (shrl v c0) (shl (shrl v c1) c2)):
+///     expands (shrl v c0) -> (shrl (shrl v c1) c3)
+///
+/// Such that in all cases, c3+c2==bitwidth(op v c1).
+static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
+                                     SDValue ExtractFrom, SDValue &Mask,
+                                     const SDLoc &DL) {
+  assert(OppShift && ExtractFrom && "Empty SDValue");
+  assert(
+      (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
+      "Existing shift must be valid as a rotate half");
+
+  ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
+  // Preconditions:
+  //    (or (op0 v c0) (shiftl/r (op0 v c1) c2))
+  //
+  // Find opcode of the needed shift to be extracted from (op0 v c0).
+  unsigned Opcode = ISD::DELETED_NODE;
+  bool IsMulOrDiv = false;
+  // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
+  // opcode or its arithmetic (mul or udiv) variant.
+  auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
+    IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
+    if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
+      return false;
+    Opcode = NeededShift;
+    return true;
+  };
+  // op0 must be either the needed shift opcode or the mul/udiv equivalent
+  // that the needed shift can be extracted from.
+  if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
+      (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
+    return SDValue();
+
+  // op0 must be the same opcode on both sides, have the same LHS argument,
+  // and produce the same value type.
+  SDValue OppShiftLHS = OppShift.getOperand(0);
+  EVT ShiftedVT = OppShiftLHS.getValueType();
+  if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
+      OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
+      ShiftedVT != ExtractFrom.getValueType())
+    return SDValue();
+
+  // Amount of the existing shift.
+  ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
+  // Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
+  ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
+  // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
+  ConstantSDNode *ExtractFromCst =
+      isConstOrConstSplat(ExtractFrom.getOperand(1));
+  // TODO: We should be able to handle non-uniform constant vectors for these values
+  // Check that we have constant values.
+  if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
+      !OppLHSCst || !OppLHSCst->getAPIntValue() ||
+      !ExtractFromCst || !ExtractFromCst->getAPIntValue())
+    return SDValue();
+
+  // Compute the shift amount we need to extract to complete the rotate.
+  const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
+  APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
+  if (NeededShiftAmt.isNegative())
+    return SDValue();
+  // Normalize the bitwidth of the two mul/udiv/shift constant operands.
+  APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
+  APInt OppLHSAmt = OppLHSCst->getAPIntValue();
+  zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);
+
+  // Now try extract the needed shift from the ExtractFrom op and see if the
+  // result matches up with the existing shift's LHS op.
+  if (IsMulOrDiv) {
+    // Op to extract from is a mul or udiv by a constant.
+    // Check:
+    //     c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
+    //     c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
+    const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
+                                                 NeededShiftAmt.getZExtValue());
+    APInt ResultAmt;
+    APInt Rem;
+    APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
+    if (Rem != 0 || ResultAmt != OppLHSAmt)
+      return SDValue();
+  } else {
+    // Op to extract from is a shift by a constant.
+    // Check:
+    //     c2 - (bitwidth(op0 v c0) - c1) == c0
+    if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
+                                          ExtractFromAmt.getBitWidth()))
+      return SDValue();
+  }
+
+  // Return the expanded shift op that should allow a rotate to be formed.
+  EVT ShiftVT = OppShift.getOperand(1).getValueType();
+  EVT ResVT = ExtractFrom.getValueType();
+  SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
+  return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
+}
+
 // Return true if we can prove that, whenever Neg and Pos are both in the
 // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
 // for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
@@ -5333,14 +5445,41 @@
   // Match "(X shl/srl V1) & V2" where V2 may not be present.
   SDValue LHSShift;   // The shift.
   SDValue LHSMask;    // AND value if any.
-  if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
-    return nullptr; // Not part of a rotate.
+  matchRotateHalf(DAG, LHS, LHSShift, LHSMask);

   SDValue RHSShift;   // The shift.
   SDValue RHSMask;    // AND value if any.
-  if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
-    return nullptr; // Not part of a rotate.
+  matchRotateHalf(DAG, RHS, RHSShift, RHSMask);
+
+  // If neither side matched a rotate half, bail
+  if (!LHSShift && !RHSShift)
+    return nullptr;
+
+  // InstCombine may have combined a constant shl, srl, mul, or udiv with one
+  // side of the rotate, so try to handle that here. In all cases we need to
+  // pass the matched shift from the opposite side to compute the opcode and
+  // needed shift amount to extract. We still want to do this if both sides
+  // matched a rotate half because one half may be a potential overshift that
+  // can be broken down (ie if InstCombine merged two shl or srl ops into a
+  // single one).
+
+  // Have LHS side of the rotate, try to extract the needed shift from the RHS.
+  if (LHSShift)
+    if (SDValue NewRHSShift =
+            extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
+      RHSShift = NewRHSShift;
+  // Have RHS side of the rotate, try to extract the needed shift from the LHS.
+  if (RHSShift)
+    if (SDValue NewLHSShift =
+            extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
+      LHSShift = NewLHSShift;
+
+  // If a side is still missing, nothing else we can do.
+  if (!RHSShift || !LHSShift)
+    return nullptr;
+
+  // At this point we've matched or extracted a shift op on each side.
+
   if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
     return nullptr;   // Not shifting the same value.
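The doc comment on extractShiftForRotate above pins the relationship c3 + c2 == bitwidth(op v c1). As a concrete instance of the shl case, here is a minimal standalone C++ sketch, not part of the patch, using the constants 3, 10 and 57 that ror_extract_shl and rolq_extract_shl in the tests below are built around; rotl64 is a local helper, not an LLVM API:

    #include <cassert>
    #include <cstdint>

    // Rotate left on 64 bits; R must be in [1, 63] for this simple form.
    static uint64_t rotl64(uint64_t X, unsigned R) {
      return (X << R) | (X >> (64 - R));
    }

    int main() {
      for (uint64_t V : {42ULL, 0x0123456789ABCDEFULL, ~0ULL}) {
        // (or (shl v 10) (srl (shl v 3) 57)): c0 = 10, c1 = 3, c2 = 57.
        uint64_t Or = (V << 10) | ((V << 3) >> 57);
        // extractShiftForRotate rewrites (shl v 10) as (shl (shl v 3) 7),
        // because c3 = 64 - 57 = 7 and 3 + 7 == 10, so the OR is a rotate.
        assert(Or == rotl64(V << 3, 7));
      }
      return 0;
    }

The same identity is what lets the AArch64 and X86 tests below check for a single ror/rol instead of the old shift/or sequence.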
Index: llvm/trunk/test/CodeGen/AArch64/rotate-extract.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/rotate-extract.ll
+++ llvm/trunk/test/CodeGen/AArch64/rotate-extract.ll
@@ -11,9 +11,8 @@
 define i64 @ror_extract_shl(i64 %i) nounwind {
 ; CHECK-LABEL: ror_extract_shl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl x8, x0, #10
-; CHECK-NEXT:    bfxil x8, x0, #54, #7
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    lsl x8, x0, #3
+; CHECK-NEXT:    ror x0, x8, #57
 ; CHECK-NEXT:    ret
   %lhs_mul = shl i64 %i, 3
   %rhs_mul = shl i64 %i, 10
@@ -25,8 +24,8 @@
 define i32 @ror_extract_shrl(i32 %i) nounwind {
 ; CHECK-LABEL: ror_extract_shrl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror w8, w0, #7
-; CHECK-NEXT:    and w0, w8, #0xf1ffffff
+; CHECK-NEXT:    lsr w8, w0, #3
+; CHECK-NEXT:    ror w0, w8, #4
 ; CHECK-NEXT:    ret
   %lhs_div = lshr i32 %i, 7
   %rhs_div = lshr i32 %i, 3
@@ -54,8 +53,8 @@
 ; CHECK-NEXT:    mov x8, #-6148914691236517206
 ; CHECK-NEXT:    movk x8, #43691
 ; CHECK-NEXT:    umulh x8, x0, x8
-; CHECK-NEXT:    ror x8, x8, #5
-; CHECK-NEXT:    and x0, x8, #0xf7ffffffffffffff
+; CHECK-NEXT:    lsr x8, x8, #1
+; CHECK-NEXT:    ror x0, x8, #4
 ; CHECK-NEXT:    ret
   %lhs_div = udiv i64 %i, 3
   %rhs_div = udiv i64 %i, 48
@@ -67,11 +66,9 @@
 define i64 @ror_extract_mul_with_mask(i64 %i) nounwind {
 ; CHECK-LABEL: ror_extract_mul_with_mask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, w0, lsl #3
-; CHECK-NEXT:    lsl w8, w8, #7
-; CHECK-NEXT:    add x9, x0, x0, lsl #3
-; CHECK-NEXT:    and x0, x8, #0x80
-; CHECK-NEXT:    bfxil x0, x9, #57, #7
+; CHECK-NEXT:    add x8, x0, x0, lsl #3
+; CHECK-NEXT:    ror x8, x8, #57
+; CHECK-NEXT:    and x0, x8, #0xff
 ; CHECK-NEXT:    ret
   %lhs_mul = mul i64 %i, 1152
   %rhs_mul = mul i64 %i, 9
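The ror_extract_udiv case above depends on (udiv i 48) being exactly (srl (udiv i 3) 4) for unsigned division, so the OR collapses to a rotate of (udiv i 3). Below is a small compile-time sketch of that equivalence; it is not from the patch, the shl-by-60 is inferred from the ror by 4 the test expects, and orForm/rotr64 are illustrative names only:

    #include <cstdint>

    constexpr uint64_t rotr64(uint64_t X, unsigned R) {
      return (X >> R) | (X << (64 - R));
    }

    // (or (shl (udiv i 3) 60) (udiv i 48)) with 60 + 4 == 64.
    constexpr uint64_t orForm(uint64_t I) {
      return ((I / 3) << 60) | (I / 48);
    }

    static_assert(orForm(0x123456789ABCDEF0ULL) ==
                      rotr64(0x123456789ABCDEF0ULL / 3, 4),
                  "udiv side folds into a rotate by 4");
    static_assert(orForm(~0ULL) == rotr64(~0ULL / 3, 4),
                  "also holds for the all-ones input");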
Index: llvm/trunk/test/CodeGen/X86/rotate-extract-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/rotate-extract-vector.ll
+++ llvm/trunk/test/CodeGen/X86/rotate-extract-vector.ll
@@ -12,10 +12,10 @@
 define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
 ; CHECK-LABEL: vroll_v4i32_extract_shl:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslld $3, %xmm0, %xmm1
-; CHECK-NEXT:    vpslld $10, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrld $25, %xmm1, %xmm1
-; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
+; CHECK-NEXT:    vprold $7, %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
   %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
@@ -25,20 +25,12 @@
 }

 define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
-; X86-LABEL: vrolq_v4i64_extract_shrl:
-; X86:       # %bb.0:
-; X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X86-NEXT:    vprolq $24, %zmm0, %zmm0
-; X86-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: vrolq_v4i64_extract_shrl:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-NEXT:    vprolq $24, %zmm0, %zmm0
-; X64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744073189457919,18446744073189457919,18446744073189457919,18446744073189457919]
-; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: vrolq_v4i64_extract_shrl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
+; CHECK-NEXT:    vprolq $29, %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
   %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
   %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
@@ -49,12 +41,10 @@
 define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
 ; CHECK-LABEL: vroll_extract_mul:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [640,640,640,640,640,640,640,640]
-; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10]
-; CHECK-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrld $26, %ymm0, %ymm0
-; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
+; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vprold $6, %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %lhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
   %rhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
@@ -66,7 +56,7 @@
 define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
 ; X86-LABEL: vrolq_extract_udiv:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovss %xmm0, (%esp)
@@ -85,53 +75,27 @@
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $384, {{[0-9]+}}(%esp) # imm = 0x180
-; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    vextractps $2, %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $384, {{[0-9]+}}(%esp) # imm = 0x180
-; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    vpsllq $57, %xmm1, %xmm1
-; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X86-NEXT:    addl $60, %esp
+; X86-NEXT:    vprolq $57, %zmm0, %zmm0
+; X86-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X86-NEXT:    addl $44, %esp
+; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vrolq_extract_udiv:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpextrq $1, %xmm0, %rax
-; X64-NEXT:    movabsq $-6148914691236517205, %rsi # imm = 0xAAAAAAAAAAAAAAAB
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    vmovq %rax, %xmm1
+; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    shrq %rdx
+; X64-NEXT:    vmovq %rdx, %xmm1
 ; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    vmovq %rax, %xmm0
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    shrq %rdx
+; X64-NEXT:    vmovq %rdx, %xmm0
 ; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT:    shrq $8, %rcx
-; X64-NEXT:    vmovq %rcx, %xmm1
-; X64-NEXT:    shrq $8, %rdx
-; X64-NEXT:    vmovq %rdx, %xmm2
-; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-NEXT:    vpsllq $57, %xmm0, %xmm0
-; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vprolq $57, %zmm0, %zmm0
+; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
   %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
@@ -141,17 +105,23 @@
 }

 define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
-; CHECK-LABEL: vrolw_extract_mul_with_mask:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1152,1152,1152,1152]
-; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
-; CHECK-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [160,160,160,160]
-; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpsrld $25, %xmm0, %xmm0
-; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: vrolw_extract_mul_with_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
+; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vprold $7, %zmm0, %zmm0
+; X86-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
+;
+; X64-LABEL: vrolw_extract_mul_with_mask:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
+; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vprold $7, %zmm0, %zmm0
+; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
   %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
   %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
   %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
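The multiplier pairs in the vector tests above (10 and 640 in vroll_extract_mul, 9 and 1152 in vrolw_extract_mul_with_mask) pass because the larger constant is the smaller one times a power of two, which is what the IsMulOrDiv branch of extractShiftForRotate checks with APInt::udivrem. A rough scalar sketch of that check, with illustrative names and plain 64-bit integers standing in for the APInt math:

    #include <cstdint>
    #include <optional>

    // Given (or (mul v ExtractFromAmt) (srl (mul v OppLHSAmt) OppShiftAmt)),
    // return the amount c3 that lets (mul v ExtractFromAmt) be rewritten as
    // (shl (mul v OppLHSAmt) c3), or nothing if the constants don't line up.
    std::optional<unsigned> neededShiftAmt(uint64_t ExtractFromAmt,
                                           uint64_t OppLHSAmt,
                                           unsigned OppShiftAmt,
                                           unsigned BitWidth) {
      if (OppShiftAmt == 0 || OppShiftAmt > BitWidth || BitWidth > 64)
        return std::nullopt;
      unsigned C3 = BitWidth - OppShiftAmt; // c3 + c2 == bitwidth
      uint64_t Div = uint64_t(1) << C3;
      // The factor between the two constants must be exactly 1 << c3.
      if (ExtractFromAmt % Div != 0 || ExtractFromAmt / Div != OppLHSAmt)
        return std::nullopt;
      return C3;
    }

    // neededShiftAmt(640, 10, 26, 32) == 6, matching the vprold $6 above.
    // neededShiftAmt(1152, 9, 25, 32) == 7, matching the rotates by 7 below.

The udiv flavour runs the same divisibility test on the two divisors.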
Index: llvm/trunk/test/CodeGen/X86/rotate-extract.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/rotate-extract.ll
+++ llvm/trunk/test/CodeGen/X86/rotate-extract.ll
@@ -24,9 +24,7 @@
 ; X64-LABEL: rolq_extract_shl:
 ; X64:       # %bb.0:
 ; X64-NEXT:    leaq (,%rdi,8), %rax
-; X64-NEXT:    shlq $10, %rdi
-; X64-NEXT:    shrq $57, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rolq $7, %rax
 ; X64-NEXT:    retq
   %lhs_mul = shl i64 %i, 3
   %rhs_mul = shl i64 %i, 10
@@ -39,16 +37,17 @@
 ; X86-LABEL: rolw_extract_shrl:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    rolw $9, %ax
-; X86-NEXT:    andl $61951, %eax # imm = 0xF1FF
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    rolw $12, %ax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rolw_extract_shrl:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolw $9, %di
-; X64-NEXT:    andl $61951, %edi # imm = 0xF1FF
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    rolw $12, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %lhs_div = lshr i16 %i, 7
   %rhs_div = lshr i16 %i, 3
@@ -60,22 +59,16 @@
 define i32 @roll_extract_mul(i32 %i) nounwind {
 ; X86-LABEL: roll_extract_mul:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    shll $7, %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %ecx
-; X86-NEXT:    shrl $25, %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %eax
+; X86-NEXT:    roll $7, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: roll_extract_mul:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    shll $7, %edi
-; X64-NEXT:    leal (%rdi,%rdi,8), %ecx
-; X64-NEXT:    shrl $25, %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    roll $7, %eax
 ; X64-NEXT:    retq
   %lhs_mul = mul i32 %i, 9
   %rhs_mul = mul i32 %i, 1152
@@ -89,11 +82,8 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull $171, %eax, %eax
-; X86-NEXT:    movb %ah, %cl
-; X86-NEXT:    shlb $3, %cl
-; X86-NEXT:    andb $-16, %cl
-; X86-NEXT:    shrl $13, %eax
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    shrl $9, %eax
+; X86-NEXT:    rolb $4, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -101,12 +91,8 @@
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    imull $171, %eax, %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrl $8, %ecx
-; X64-NEXT:    shlb $3, %cl
-; X64-NEXT:    andb $-16, %cl
-; X64-NEXT:    shrl $13, %eax
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    shrl $9, %eax
+; X64-NEXT:    rolb $4, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %lhs_div = udiv i8 %i, 3
@@ -139,12 +125,8 @@
 ; X64-LABEL: rolq_extract_mul_with_mask:
 ; X64:       # %bb.0:
 ; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
-; X64-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
-; X64-NEXT:    shll $7, %edi
-; X64-NEXT:    leal (%rdi,%rdi,8), %ecx
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    shrq $57, %rax
-; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    rolq $7, %rax
+; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
   %lhs_mul = mul i64 %i, 1152
   %rhs_mul = mul i64 %i, 9
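For reference, the scalar tests above correspond to source-level patterns like the following, where InstCombine has already folded the rotate's shift into the multiply. This is a sketch only; the function names are borrowed from the tests, not from any real header. With the new combine the backends can again select a single rol plus, at most, the leftover mask:

    #include <cstdint>

    // (i * 1152) == ((i * 9) << 7) modulo 2^32, so this whole expression is
    // rotl(i * 9, 7); the X64 checks above now expect leal + roll $7 for it.
    uint32_t roll_extract_mul(uint32_t i) {
      return (i * 1152) | ((i * 9) >> 25);
    }

    // Same idea with a mask on one operand: the rotate is still formed and
    // the AND stays on the result (leaq + rolq $7 + movzbl in the checks).
    uint64_t rolq_extract_mul_with_mask(uint64_t i) {
      return ((i * 1152) & 0xff) | ((i * 9) >> 57);
    }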