diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4683,6 +4683,17 @@
       EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond,
       DAGCombinerInfo &DCI, const SDLoc &DL) const;
 
+  // Simplify SETCC testing shifted value for equality/non-equality to zero by
+  // removing redundant operations generated during shift's expansion.
+  // Shift's expansion (when its result is fed into SETCC eq/ne 0) generates a
+  // tree consisting of OR and multiple SRL/SHL (that may be combined into
+  // FSHL/FSHR). When a pair of such shifts is applied to the same operand it
+  // performs a rotation, which can be eliminated because the overall result
+  // is only compared with zero.
+  SDValue optimizeSetCCOfExpandedShift(EVT SCCVT, SDValue N0, SDValue N1C,
+                                       ISD::CondCode Cond, DAGCombinerInfo &DCI,
+                                       const SDLoc &DL) const;
+
   SDValue prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
                             SDValue CompTargetNode, ISD::CondCode Cond,
                             DAGCombinerInfo &DCI, const SDLoc &DL,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3390,6 +3390,138 @@
   return T2;
 }
 
+// Simplify a SETCC that tests a shifted value for equality/inequality with
+// zero by removing redundant operations produced by the shift's expansion.
+// Expanding a wide shift whose result feeds (setcc eq/ne 0) yields a tree of
+// ORs over SRL/SHL nodes (possibly already combined into FSHL/FSHR). When two
+// shifts of the same operand in opposite directions have amounts that sum to
+// the operand's bit width, together they rotate the operand; the rotation can
+// be dropped because only "is any bit set" matters to the comparison.
+//
+// Examples of redundant shift elimination (C0 + C1 == bit width):
+//   (or (or (srl X, C0), (shl Y, C1)), (srl Y, C0)) ==/!= 0
+//   --> (or (srl X, C0), Y) ==/!= 0
+//
+//   (or (or (srl Y, C0), (shl X, C1)), (shl Y, C1)) ==/!= 0
+//   --> (or (shl X, C1), Y) ==/!= 0
+//
+//   (or (srl X, C), (fshr X, Y, C)) ==/!= 0 --> (or (srl Y, C), X) ==/!= 0
+//
+//   (or (or (fshl W, X, C), (fshl X, Y, C)),
+//       (or (fshl Y, Z, C), (shl Z, C))) ==/!= 0
+//   --> (or (or (shl W, C), X), (or Y, Z)) ==/!= 0
+SDValue TargetLowering::optimizeSetCCOfExpandedShift(EVT SCCVT, SDValue N0,
+                                                     SDValue N1C,
+                                                     ISD::CondCode Cond,
+                                                     DAGCombinerInfo &DCI,
+                                                     const SDLoc &DL) const {
+  assert(isConstOrConstSplat(N1C) &&
+         isConstOrConstSplat(N1C)->getAPIntValue().isZero() &&
+         "Should be a comparison with 0.");
+  assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Unexpected condcode");
+
+  struct ShiftInfo {
+    APInt Bits;
+    bool IsLeft = false;
+  };
+  SmallDenseMap<SDValue, ShiftInfo, 4> UnmatchedShifts;
+  SmallVector<SDValue, 8> Result;
+
+  // Match pairs of shifts of the same operand that together rotate it. The
+  // first shift seen for an operand is recorded in UnmatchedShifts; a later
+  // one must go the opposite way with the two amounts summing to OpSizeInBits
+  // (compared as plain integers, as the constants' widths may differ). Only
+  // amounts in (0, OpSizeInBits) are accepted; zero marks an empty map slot.
+  unsigned MatchedShiftsCount = 0;
+  unsigned OpSizeInBits = N0.getValueType().getScalarSizeInBits();
+  auto MatchShifts = [&UnmatchedShifts, &Result, &MatchedShiftsCount,
+                      OpSizeInBits](SDValue &Op, APInt C, bool IsLeft) {
+    if (C.isZero() || C.uge(OpSizeInBits))
+      return false;
+    ShiftInfo &Info = UnmatchedShifts[Op];
+    if (Info.Bits.isZero()) {
+      Info.Bits = C;
+      Info.IsLeft = IsLeft;
+      return true;
+    }
+    if (Info.IsLeft == IsLeft)
+      return false;
+    if (Info.Bits.getZExtValue() + C.getZExtValue() != OpSizeInBits)
+      return false;
+    Result.push_back(Op);
+    UnmatchedShifts.erase(Op);
+    ++MatchedShiftsCount;
+    return true;
+  };
+
+  // Recursively scan the DAG rooted at N0. The scan succeeds only if:
+  //  1) every visited node has exactly one use;
+  //  2) every shift is either the first one found for its operand or pairs
+  //     with a previously found shift in the opposite direction;
+  //  3) HeightLimit is not exhausted.
+  std::function<bool(const SDValue &, unsigned)> Scan;
+  Scan = [&Scan, &MatchShifts, &Result, OpSizeInBits](const SDValue &Value,
+                                                      unsigned HeightLimit) {
+    if (HeightLimit == 0 || !Value->hasOneUse())
+      return false;
+    unsigned Opcode = Value->getOpcode();
+    if (Opcode == ISD::OR)
+      return Scan(Value->getOperand(0), HeightLimit - 1) &&
+             Scan(Value->getOperand(1), HeightLimit - 1);
+    if (Opcode == ISD::SRL || Opcode == ISD::SHL) {
+      auto *C = dyn_cast<ConstantSDNode>(Value->getOperand(1));
+      if (!C)
+        return false;
+      SDValue Op = Value->getOperand(0);
+      return MatchShifts(Op, C->getAPIntValue(), Opcode == ISD::SHL);
+    }
+    if (Opcode == ISD::FSHL || Opcode == ISD::FSHR) {
+      auto *C = dyn_cast<ConstantSDNode>(Value->getOperand(2));
+      if (!C)
+        return false;
+      SDValue Op0 = Value->getOperand(0);
+      SDValue Op1 = Value->getOperand(1);
+      APInt CVal = C->getAPIntValue();
+      // fshl(A, B, C) == (A << C) | (B >> (BW - C)) and
+      // fshr(A, B, C) == (A << (BW - C)) | (B >> C): A always moves left.
+      APInt Rest = OpSizeInBits - CVal;
+      bool IsFSHL = Opcode == ISD::FSHL;
+      return MatchShifts(Op0, IsFSHL ? CVal : Rest, /*IsLeft=*/true) &&
+             MatchShifts(Op1, IsFSHL ? Rest : CVal, /*IsLeft=*/false);
+    }
+    Result.push_back(Value); // Any other node is a leaf and is kept as-is.
+    return true;
+  };
+  // Shift of i4096 operand legalized to i32 is expanded into a tree that
+  // has a height of 8, so it should be enough to cover most practical cases.
+  constexpr unsigned MaxTreeHeight = 8;
+  if (!Scan(N0, MaxTreeHeight))
+    return SDValue();
+  // There should be at most one unmatched shift and at least one pair
+  // of matched shifts.
+  if (MatchedShiftsCount == 0 || UnmatchedShifts.size() > 1 || Result.empty())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Recreate value for unmatched shift.
+  if (!UnmatchedShifts.empty()) {
+    auto UnmatchedShift = UnmatchedShifts.begin();
+    SDValue Op = UnmatchedShift->first;
+    ShiftInfo &Info = UnmatchedShift->second;
+    EVT ShiftTy = EVT::getIntegerVT(*DAG.getContext(), Info.Bits.getBitWidth());
+    SDValue Con = DAG.getConstant(Info.Bits, DL, ShiftTy);
+    unsigned ShiftOpcode = Info.IsLeft ? ISD::SHL : ISD::SRL;
+    Result.push_back(DAG.getNode(ShiftOpcode, DL, N0.getValueType(), Op, Con));
+  }
+  // Reduce all values using OR.
+  for (size_t Index = 0; Index + 1 < Result.size(); Index += 2) {
+    SDValue NewOr = DAG.getNode(ISD::OR, DL, N0.getValueType(), Result[Index],
+                                Result[Index + 1]);
+    Result.push_back(NewOr);
+  }
+  return DAG.getSetCC(DL, SCCVT, Result.back(), N1C, Cond);
+}
+
 /// Try to fold an equality comparison with a {add/sub/xor} binary operation as
 /// the 1st operand (N0). Callers are expected to swap the N0/N1 parameters to
 /// handle the commuted versions of these patterns.
@@ -4010,12 +4142,19 @@ } if (Cond == ISD::SETEQ || Cond == ISD::SETNE) { - // (X & (C l>>/<< Y)) ==/!= 0 --> ((X <> Y) & C) ==/!= 0 - if (C1.isZero()) + if (C1.isZero()) { + // (X & (C l>>/<< Y)) ==/!= 0 --> ((X <> Y) & C) ==/!= 0 if (SDValue CC = optimizeSetCCByHoistingAndByConstFromLogicalShift( VT, N0, N1, Cond, DCI, dl)) return CC; + // Try to simplify expanded shift by removing shift operations + // that effectively perform rotation. + if (SDValue CC = + optimizeSetCCOfExpandedShift(VT, N0, N1, Cond, DCI, dl)) + return CC; + } + // For all/any comparisons, replace or(x,shl(y,bw/2)) with and/or(x,y). // For example, when high 32-bits of i64 X are known clear: // all bits clear: (X | (Y<<32)) == 0 --> (X | Y) == 0 diff --git a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll --- a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll @@ -12,8 +12,7 @@ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds x0, x0, #1 ; CHECK-NEXT: adcs x1, x1, xzr -; CHECK-NEXT: extr x8, x1, x0, #60 -; CHECK-NEXT: orr x8, x8, x1, lsr #60 +; CHECK-NEXT: orr x8, x1, x0, lsr #60 ; CHECK-NEXT: cbnz x8, .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret @@ -32,8 +31,7 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_eq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: extr x8, x1, x0, #17 -; CHECK-NEXT: orr x8, x8, x1, lsr #17 +; CHECK-NEXT: orr x8, x1, x0, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -45,8 +43,7 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: extr x8, x1, x0, #17 -; CHECK-NEXT: orr x8, x8, x1, lsr #17 +; CHECK-NEXT: orr x8, x1, x0, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -58,8 +55,7 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_eq_zero: ; CHECK: // 
%bb.0: -; CHECK-NEXT: extr x8, x1, x0, #47 -; CHECK-NEXT: orr x8, x8, x0, lsl #17 +; CHECK-NEXT: orr x8, x0, x1, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -71,8 +67,7 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: extr x8, x1, x0, #47 -; CHECK-NEXT: orr x8, x8, x0, lsl #17 +; CHECK-NEXT: orr x8, x0, x1, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -106,8 +101,7 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts: ; CHECK: // %bb.0: -; CHECK-NEXT: extr x8, x0, x1, #47 -; CHECK-NEXT: orr x8, x8, x1, lsl #17 +; CHECK-NEXT: orr x8, x1, x0, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -139,4 +133,21 @@ ret i1 %cmp } +define i1 @opt_setcc_shl_ne_zero_i256(i256 %a) nounwind { +; CHECK-LABEL: opt_setcc_shl_ne_zero_i256: +; CHECK: // %bb.0: +; CHECK-NEXT: extr x8, x3, x2, #47 +; CHECK-NEXT: extr x9, x2, x1, #47 +; CHECK-NEXT: extr x10, x1, x0, #47 +; CHECK-NEXT: orr x9, x9, x0, lsl #17 +; CHECK-NEXT: orr x8, x10, x8 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shl = shl i256 %a, 17 + %cmp = icmp ne i256 %shl, 0 + ret i1 %cmp +} + declare void @use(i128 %a) diff --git a/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll b/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll --- a/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll +++ b/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll @@ -630,14 +630,10 @@ ; CHECKV7M-NEXT: ldrd lr, r0, [sp, #8] ; CHECKV7M-NEXT: beq .LBB6_2 ; CHECKV7M-NEXT: @ %bb.1: @ %then -; CHECKV7M-NEXT: lsrs r2, r2, #17 -; CHECKV7M-NEXT: orr.w r2, r2, r3, lsl #15 -; CHECKV7M-NEXT: orr.w r2, r2, r3, lsr #17 -; CHECKV7M-NEXT: lsr.w r3, r12, #17 -; CHECKV7M-NEXT: orr.w r3, r3, r1, lsl #15 +; CHECKV7M-NEXT: orr.w r2, r3, r2, lsr #17 +; CHECKV7M-NEXT: orr.w r1, r1, 
r12, lsr #17 ; CHECKV7M-NEXT: cmp r2, #0 ; CHECKV7M-NEXT: mov r2, r0 -; CHECKV7M-NEXT: orr.w r1, r3, r1, lsr #17 ; CHECKV7M-NEXT: it ne ; CHECKV7M-NEXT: movne r2, lr ; CHECKV7M-NEXT: cmp r1, #0 @@ -646,9 +642,7 @@ ; CHECKV7M-NEXT: add r0, r2 ; CHECKV7M-NEXT: pop {r7, pc} ; CHECKV7M-NEXT: .LBB6_2: @ %else -; CHECKV7M-NEXT: lsrs r1, r2, #17 -; CHECKV7M-NEXT: orr.w r1, r1, r3, lsl #15 -; CHECKV7M-NEXT: orr.w r1, r1, r3, lsr #17 +; CHECKV7M-NEXT: orr.w r1, r3, r2, lsr #17 ; CHECKV7M-NEXT: cmp r1, #0 ; CHECKV7M-NEXT: it ne ; CHECKV7M-NEXT: movne r0, lr @@ -664,14 +658,10 @@ ; CHECKV7A-NEXT: lsls r4, r4, #31 ; CHECKV7A-NEXT: beq .LBB6_2 ; CHECKV7A-NEXT: @ %bb.1: @ %then -; CHECKV7A-NEXT: lsrs r2, r2, #17 -; CHECKV7A-NEXT: orr.w r2, r2, r3, lsl #15 -; CHECKV7A-NEXT: orr.w r2, r2, r3, lsr #17 -; CHECKV7A-NEXT: lsr.w r3, r12, #17 -; CHECKV7A-NEXT: orr.w r3, r3, r1, lsl #15 +; CHECKV7A-NEXT: orr.w r2, r3, r2, lsr #17 +; CHECKV7A-NEXT: orr.w r1, r1, r12, lsr #17 ; CHECKV7A-NEXT: cmp r2, #0 ; CHECKV7A-NEXT: mov r2, r0 -; CHECKV7A-NEXT: orr.w r1, r3, r1, lsr #17 ; CHECKV7A-NEXT: it ne ; CHECKV7A-NEXT: movne r2, lr ; CHECKV7A-NEXT: cmp r1, #0 @@ -680,9 +670,7 @@ ; CHECKV7A-NEXT: add r0, r2 ; CHECKV7A-NEXT: pop {r4, pc} ; CHECKV7A-NEXT: .LBB6_2: @ %else -; CHECKV7A-NEXT: lsrs r1, r2, #17 -; CHECKV7A-NEXT: orr.w r1, r1, r3, lsl #15 -; CHECKV7A-NEXT: orr.w r1, r1, r3, lsr #17 +; CHECKV7A-NEXT: orr.w r1, r3, r2, lsr #17 ; CHECKV7A-NEXT: cmp r1, #0 ; CHECKV7A-NEXT: it ne ; CHECKV7A-NEXT: movne r0, lr diff --git a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll --- a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll @@ -12,9 +12,7 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r0, r0, #1 ; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: lsr r2, r0, #16 -; CHECK-NEXT: orr r2, r2, r1, lsl #16 -; CHECK-NEXT: orr r2, r2, r1, lsr #16 +; CHECK-NEXT: orr r2, r1, r0, lsr #16 ; CHECK-NEXT: cmp r2, #0 ; 
CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %exit @@ -34,9 +32,7 @@ define i1 @opt_setcc_srl_eq_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_eq_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsr r0, r0, #17 -; CHECK-NEXT: orr r0, r0, r1, lsl #15 -; CHECK-NEXT: orr r0, r0, r1, lsr #17 +; CHECK-NEXT: orr r0, r1, r0, lsr #17 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: lsr r0, r0, #5 ; CHECK-NEXT: bx lr @@ -48,9 +44,7 @@ define i1 @opt_setcc_srl_ne_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_ne_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsr r0, r0, #17 -; CHECK-NEXT: orr r0, r0, r1, lsl #15 -; CHECK-NEXT: orr r0, r0, r1, lsr #17 +; CHECK-NEXT: orr r0, r1, r0, lsr #17 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: movwne r0, #1 ; CHECK-NEXT: bx lr @@ -62,9 +56,7 @@ define i1 @opt_setcc_shl_eq_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_eq_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsl r1, r1, #17 -; CHECK-NEXT: orr r1, r1, r0, lsr #15 -; CHECK-NEXT: orr r0, r1, r0, lsl #17 +; CHECK-NEXT: orr r0, r0, r1, lsl #17 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: lsr r0, r0, #5 ; CHECK-NEXT: bx lr @@ -76,9 +68,7 @@ define i1 @opt_setcc_shl_ne_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsl r1, r1, #17 -; CHECK-NEXT: orr r1, r1, r0, lsr #15 -; CHECK-NEXT: orr r0, r1, r0, lsl #17 +; CHECK-NEXT: orr r0, r0, r1, lsl #17 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: movwne r0, #1 ; CHECK-NEXT: bx lr @@ -113,9 +103,7 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsl r0, r0, #17 -; CHECK-NEXT: orr r0, r0, r1, lsr #15 -; CHECK-NEXT: orr r0, r0, r1, lsl #17 +; CHECK-NEXT: orr r0, r1, r0, lsl #17 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: lsr r0, r0, #5 ; CHECK-NEXT: bx lr @@ -148,4 +136,17 @@ ret i1 %cmp } +define i1 @opt_setcc_shl_ne_zero_i128(i128 %a) nounwind { +; CHECK-LABEL: opt_setcc_shl_ne_zero_i128: +; CHECK: @ %bb.0: +; 
CHECK-NEXT: orr r2, r2, r3, lsl #17 +; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: orrs r0, r0, r2 +; CHECK-NEXT: movwne r0, #1 +; CHECK-NEXT: bx lr + %shl = shl i128 %a, 17 + %cmp = icmp ne i128 %shl, 0 + ret i1 %cmp +} + declare void @use(i64 %a) diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -13,34 +13,29 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: addl $1, %ecx +; X86-NEXT: addl $1, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl $4, %edx, %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shldl $4, %esi, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrl $28, %ecx -; X86-NEXT: orl %ebp, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: shrl $28, %ebp +; X86-NEXT: orl %ebx, %ebp ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -56,11 +51,9 @@ ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: addq $1, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq 
%rdx, %rcx -; X64-NEXT: shldq $4, %rax, %rcx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: shrq $60, %rsi -; X64-NEXT: orq %rcx, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $60, %rcx +; X64-NEXT: orq %rdx, %rcx ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # %bb.2: # %exit ; X64-NEXT: retq @@ -79,30 +72,19 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: shldl $15, %edx, %edi -; X86-NEXT: shldl $15, %ecx, %edx -; X86-NEXT: shrdl $17, %ecx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: shrl $17, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $17, %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shrdq $17, %rsi, %rdi -; X64-NEXT: shrq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shrq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: sete %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -113,30 +95,19 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: shldl $15, %edx, %edi -; X86-NEXT: shldl $15, %ecx, %edx -; X86-NEXT: shrdl $17, %ecx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: shrl $17, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $17, %eax 
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: setne %al -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shrdq $17, %rsi, %rdi -; X64-NEXT: shrq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shrq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: setne %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -147,27 +118,19 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shldl $17, %esi, %edx -; X86-NEXT: shldl $17, %ecx, %esi -; X86-NEXT: shldl $17, %eax, %ecx -; X86-NEXT: shll $17, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shldq $17, %rdi, %rsi -; X64-NEXT: shlq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shlq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -178,27 +141,19 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shldl $17, %esi, %edx -; X86-NEXT: shldl $17, %ecx, %esi -; X86-NEXT: shldl $17, %eax, %ecx -; X86-NEXT: shll $17, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: 
orl %eax, %ecx ; X86-NEXT: setne %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shldq $17, %rdi, %rsi -; X64-NEXT: shlq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shlq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: setne %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -262,27 +217,19 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind { ; X86-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shldl $17, %edx, %esi -; X86-NEXT: shldl $17, %ecx, %edx -; X86-NEXT: shldl $17, %eax, %ecx -; X86-NEXT: shll $17, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %esi, %ecx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X64: # %bb.0: -; X64-NEXT: shldq $17, %rsi, %rdi -; X64-NEXT: shlq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shlq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl.a = shl i64 %a, 17