diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4683,6 +4683,17 @@
       EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond,
       DAGCombinerInfo &DCI, const SDLoc &DL) const;
 
+  // Simplify a SETCC that tests the result of an expanded shift for
+  // equality/inequality with zero by dropping the redundant shift operations.
+  // Expanding a wide shift whose result feeds SETCC eq/ne 0 produces a tree of
+  // ORs over SRL/SHL nodes (possibly already combined into FSHL/FSHR). When two
+  // shifts of the same operand together form a rotation of that operand, the
+  // pair can be replaced by the operand itself: OR-ing in a rotated value is
+  // zero exactly when the value is zero. Returns an empty SDValue on failure.
+  SDValue optimizeSetCCOfExpandedShift(EVT SCCVT, SDValue N0, SDValue N1C,
+                                       ISD::CondCode Cond, DAGCombinerInfo &DCI,
+                                       const SDLoc &DL) const;
+
   SDValue prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
                             SDValue CompTargetNode, ISD::CondCode Cond,
                             DAGCombinerInfo &DCI, const SDLoc &DL,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3390,6 +3390,132 @@
   return T2;
 }
 
+// Fold (setcc (or-tree of constant shifts), 0, eq/ne) by replacing every pair
+// of shifts that rotates an operand with the operand itself. Returns the new
+// SETCC, or an empty SDValue if nothing was folded. With C0 + C1 == bit width:
+// (or (or (srl X, C0), (shl Y, C1)), (srl Y, C0)) ==/!= 0
+//    -->  (or (srl X, C0), Y) ==/!= 0
+// (or (or (srl Y, C0), (shl X, C1)), (shl Y, C1)) ==/!= 0
+//    -->  (or (shl X, C1), Y) ==/!= 0
+// (or (srl X, C), (fshr X, Y, C)) ==/!= 0
+//    -->  (or (srl Y, C), X) ==/!= 0
+// (or (or (fshl W, X, C), (fshl X, Y, C)),
+//     (or (fshl Y, Z, C), (shl Z, C))) ==/!= 0
+//    -->  (or (or (shl W, C), X), (or Y, Z)) ==/!= 0
+SDValue TargetLowering::optimizeSetCCOfExpandedShift(EVT SCCVT, SDValue N0,
+                                                     SDValue N1C,
+                                                     ISD::CondCode Cond,
+                                                     DAGCombinerInfo &DCI,
+                                                     const SDLoc &DL) const {
+  assert(isConstOrConstSplat(N1C) &&
+         isConstOrConstSplat(N1C)->getAPIntValue().isZero() &&
+         "Should be a comparison with 0.");
+  assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Unexpected condcode");
+
+  struct ShiftInfo {
+    APInt Bits;
+    bool IsLeft;
+  };
+  SmallDenseMap<SDValue, ShiftInfo, 16> UnmatchedShifts;
+  SmallVector<SDValue, 16> Result;
+
+  // Match pairs of shifts applied to the same operand that effectively
+  // perform its rotation:
+  // 1) the first shift seen for Op is recorded in the UnmatchedShifts map
+  //    (map membership, not the amount, marks "seen", so a shift by 0 counts);
+  // 2) a later shift of Op must go in the opposite direction and both amounts
+  //    must sum up to OpSizeInBits; Op itself then replaces the pair.
+  unsigned MatchedShiftsCount = 0;
+  unsigned OpSizeInBits = N0.getValueType().getScalarSizeInBits();
+  auto MatchShifts = [&UnmatchedShifts, &Result, &MatchedShiftsCount,
+                      OpSizeInBits](SDValue &Op, const APInt &C, bool IsLeft) {
+    // An out-of-range amount can never be one half of a rotation.
+    if (C.uge(OpSizeInBits))
+      return false;
+    auto Inserted = UnmatchedShifts.try_emplace(Op, ShiftInfo{C, IsLeft});
+    if (Inserted.second)
+      return true;
+    const ShiftInfo &Info = Inserted.first->second;
+    if (Info.IsLeft == IsLeft ||
+        Info.Bits.getZExtValue() + C.getZExtValue() != OpSizeInBits)
+      return false;
+    Result.push_back(Op);
+    UnmatchedShifts.erase(Inserted.first);
+    ++MatchedShiftsCount;
+    return true;
+  };
+
+  // Recursively scan the OR tree rooted at N0, collecting shifts and leaves.
+  // The scan fails (and nothing is folded) unless:
+  // 1) every visited node has exactly one use;
+  // 2) every shift either starts a new UnmatchedShifts entry or completes one;
+  // 3) Depth stays below SelectionDAG::MaxRecursionDepth.
+  std::function<bool(const SDValue &, unsigned)> Scan;
+  Scan = [&Scan, &MatchShifts, &Result, OpSizeInBits](const SDValue &Value,
+                                                      unsigned Depth) {
+    if (Depth >= SelectionDAG::MaxRecursionDepth || !Value->hasOneUse())
+      return false;
+    unsigned Opcode = Value->getOpcode();
+    ConstantSDNode *C;
+
+    if (Opcode == ISD::OR)
+      return Scan(Value->getOperand(0), Depth + 1) &&
+             Scan(Value->getOperand(1), Depth + 1);
+    if (Opcode == ISD::SRL || Opcode == ISD::SHL) {
+      if (!(C = dyn_cast<ConstantSDNode>(Value->getOperand(1))))
+        return false;
+      SDValue Op = Value->getOperand(0);
+      return MatchShifts(Op, C->getAPIntValue(), Opcode == ISD::SHL);
+    }
+    if (Opcode == ISD::FSHL || Opcode == ISD::FSHR) {
+      if (!(C = dyn_cast<ConstantSDNode>(Value->getOperand(2))))
+        return false;
+      const APInt &CVal = C->getAPIntValue();
+      // Only amounts strictly inside (0, OpSizeInBits) split into two shifts.
+      if (CVal.isZero() || CVal.uge(OpSizeInBits))
+        return false;
+      SDValue Op1 = Value->getOperand(0);
+      SDValue Op2 = Value->getOperand(1);
+      // fshl(A, B, C) == (A << C) | (B >> (BW - C))
+      // fshr(A, B, C) == (A << (BW - C)) | (B >> C)
+      // i.e. A always moves left and B right; only the amounts are swapped.
+      bool IsFSHL = Opcode == ISD::FSHL;
+      const APInt ComplVal = OpSizeInBits - CVal;
+      return MatchShifts(Op1, IsFSHL ? CVal : ComplVal, /*IsLeft=*/true) &&
+             MatchShifts(Op2, IsFSHL ? ComplVal : CVal, /*IsLeft=*/false);
+    }
+    Result.push_back(Value);
+    return true;
+  };
+  if (!Scan(N0, 0))
+    return SDValue();
+  // There should be at most one unmatched shift and at least one pair
+  // of matched shifts.
+  if (MatchedShiftsCount == 0 || UnmatchedShifts.size() > 1 || Result.empty())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Recreate the single shift that found no rotation counterpart.
+  if (!UnmatchedShifts.empty()) {
+    auto UnmatchedShift = UnmatchedShifts.begin();
+    SDValue Op = UnmatchedShift->first;
+    const ShiftInfo &Info = UnmatchedShift->second;
+    EVT ShiftTy = EVT::getIntegerVT(*DAG.getContext(), Info.Bits.getBitWidth());
+    SDValue Con = DAG.getConstant(Info.Bits, DL, ShiftTy);
+    unsigned ShiftOpcode = Info.IsLeft ? ISD::SHL : ISD::SRL;
+    Result.push_back(DAG.getNode(ShiftOpcode, DL, N0.getValueType(), Op, Con));
+  }
+  // Reduce all values using OR.
+  // Push new OR back to the Result list and combine pairs of values from it
+  // to generate balanced tree and shorten the critical path.
+  for (size_t Index = 0; Index + 1 < Result.size(); Index += 2) {
+    SDValue NewOr = DAG.getNode(ISD::OR, DL, N0.getValueType(), Result[Index],
+                                Result[Index + 1]);
+    Result.push_back(NewOr);
+  }
+  return DAG.getSetCC(DL, SCCVT, Result.back(), N1C, Cond);
+}
+
 /// Try to fold an equality comparison with a {add/sub/xor} binary operation as
 /// the 1st operand (N0). Callers are expected to swap the N0/N1 parameters to
 /// handle the commuted versions of these patterns.
@@ -4010,12 +4136,19 @@
     }
 
     if (Cond == ISD::SETEQ || Cond == ISD::SETNE) {
-      // (X & (C l>>/<< Y)) ==/!= 0  -->  ((X <</l>> Y) & C) ==/!= 0
-      if (C1.isZero())
+      if (C1.isZero()) {
+        // (X & (C l>>/<< Y)) ==/!= 0  -->  ((X <</l>> Y) & C) ==/!= 0
         if (SDValue CC = optimizeSetCCByHoistingAndByConstFromLogicalShift(
                 VT, N0, N1, Cond, DCI, dl))
           return CC;
 
+        // Try to simplify expanded shift by removing shift operations
+        // that effectively perform rotation.
+        if (SDValue CC =
+                optimizeSetCCOfExpandedShift(VT, N0, N1, Cond, DCI, dl))
+          return CC;
+      }
+
       // For all/any comparisons, replace or(x,shl(y,bw/2)) with and/or(x,y).
       // For example, when high 32-bits of i64 X are known clear:
       // all bits clear: (X | (Y<<32)) ==  0 --> (X | Y) ==  0
diff --git a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll
--- a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll
@@ -12,8 +12,7 @@
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds x0, x0, #1
 ; CHECK-NEXT:    adcs x1, x1, xzr
-; CHECK-NEXT:    extr x8, x1, x0, #60
-; CHECK-NEXT:    orr x8, x8, x1, lsr #60
+; CHECK-NEXT:    orr x8, x1, x0, lsr #60
 ; CHECK-NEXT:    cbnz x8, .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit
 ; CHECK-NEXT:    ret
@@ -32,8 +31,7 @@
 define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_srl_eq_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extr x8, x1, x0, #17
-; CHECK-NEXT:    orr x8, x8, x1, lsr #17
+; CHECK-NEXT:    orr x8, x1, x0, lsr #17
 ; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
@@ -45,8 +43,7 @@
 define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_srl_ne_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extr x8, x1, x0, #17
-; CHECK-NEXT:    orr x8, x8, x1, lsr #17
+; CHECK-NEXT:    orr x8, x1, x0, lsr #17
 ; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
@@ -58,8 +55,7 @@
 define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_shl_eq_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extr x8, x1, x0, #47
-; CHECK-NEXT:    orr x8, x8, x0, lsl #17
+; CHECK-NEXT:    orr x8, x0, x1, lsl #17
 ; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
@@ -71,8 +67,7 @@
 define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_shl_ne_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extr x8, x1, x0, #47
-; CHECK-NEXT:    orr x8, x8, x0, lsl #17
+; CHECK-NEXT:    orr x8, x0, x1, lsl #17
 ; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
@@ -106,8 +101,7 @@
 define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extr x8, x0, x1, #47
-; CHECK-NEXT:    orr x8, x8, x1, lsl #17
+; CHECK-NEXT:    orr x8, x1, x0, lsl #17
 ; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
@@ -139,4 +133,21 @@
   ret i1 %cmp
 }
 
+define i1 @opt_setcc_shl_ne_zero_i256(i256 %a) nounwind {
+; CHECK-LABEL: opt_setcc_shl_ne_zero_i256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    extr x8, x3, x2, #47
+; CHECK-NEXT:    extr x9, x2, x1, #47
+; CHECK-NEXT:    extr x10, x1, x0, #47
+; CHECK-NEXT:    orr x9, x9, x0, lsl #17
+; CHECK-NEXT:    orr x8, x10, x8
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+   %shl = shl i256 %a, 17
+   %cmp = icmp ne i256 %shl, 0
+   ret i1 %cmp
+}
+
 declare void @use(i128 %a)
diff --git a/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll b/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll
--- a/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll
+++ b/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll
@@ -630,14 +630,10 @@
 ; CHECKV7M-NEXT:    ldrd lr, r0, [sp, #8]
 ; CHECKV7M-NEXT:    beq .LBB6_2
 ; CHECKV7M-NEXT:  @ %bb.1: @ %then
-; CHECKV7M-NEXT:    lsrs r2, r2, #17
-; CHECKV7M-NEXT:    orr.w r2, r2, r3, lsl #15
-; CHECKV7M-NEXT:    orr.w r2, r2, r3, lsr #17
-; CHECKV7M-NEXT:    lsr.w r3, r12, #17
-; CHECKV7M-NEXT:    orr.w r3, r3, r1, lsl #15
+; CHECKV7M-NEXT:    orr.w r2, r3, r2, lsr #17
+; CHECKV7M-NEXT:    orr.w r1, r1, r12, lsr #17
 ; CHECKV7M-NEXT:    cmp r2, #0
 ; CHECKV7M-NEXT:    mov r2, r0
-; CHECKV7M-NEXT:    orr.w r1, r3, r1, lsr #17
 ; CHECKV7M-NEXT:    it ne
 ; CHECKV7M-NEXT:    movne r2, lr
 ; CHECKV7M-NEXT:    cmp r1, #0
@@ -646,9 +642,7 @@
 ; CHECKV7M-NEXT:    add r0, r2
 ; CHECKV7M-NEXT:    pop {r7, pc}
 ; CHECKV7M-NEXT:  .LBB6_2: @ %else
-; CHECKV7M-NEXT:    lsrs r1, r2, #17
-; CHECKV7M-NEXT:    orr.w r1, r1, r3, lsl #15
-; CHECKV7M-NEXT:    orr.w r1, r1, r3, lsr #17
+; CHECKV7M-NEXT:    orr.w r1, r3, r2, lsr #17
 ; CHECKV7M-NEXT:    cmp r1, #0
 ; CHECKV7M-NEXT:    it ne
 ; CHECKV7M-NEXT:    movne r0, lr
@@ -664,14 +658,10 @@
 ; CHECKV7A-NEXT:    lsls r4, r4, #31
 ; CHECKV7A-NEXT:    beq .LBB6_2
 ; CHECKV7A-NEXT:  @ %bb.1: @ %then
-; CHECKV7A-NEXT:    lsrs r2, r2, #17
-; CHECKV7A-NEXT:    orr.w r2, r2, r3, lsl #15
-; CHECKV7A-NEXT:    orr.w r2, r2, r3, lsr #17
-; CHECKV7A-NEXT:    lsr.w r3, r12, #17
-; CHECKV7A-NEXT:    orr.w r3, r3, r1, lsl #15
+; CHECKV7A-NEXT:    orr.w r2, r3, r2, lsr #17
+; CHECKV7A-NEXT:    orr.w r1, r1, r12, lsr #17
 ; CHECKV7A-NEXT:    cmp r2, #0
 ; CHECKV7A-NEXT:    mov r2, r0
-; CHECKV7A-NEXT:    orr.w r1, r3, r1, lsr #17
 ; CHECKV7A-NEXT:    it ne
 ; CHECKV7A-NEXT:    movne r2, lr
 ; CHECKV7A-NEXT:    cmp r1, #0
@@ -680,9 +670,7 @@
 ; CHECKV7A-NEXT:    add r0, r2
 ; CHECKV7A-NEXT:    pop {r4, pc}
 ; CHECKV7A-NEXT:  .LBB6_2: @ %else
-; CHECKV7A-NEXT:    lsrs r1, r2, #17
-; CHECKV7A-NEXT:    orr.w r1, r1, r3, lsl #15
-; CHECKV7A-NEXT:    orr.w r1, r1, r3, lsr #17
+; CHECKV7A-NEXT:    orr.w r1, r3, r2, lsr #17
 ; CHECKV7A-NEXT:    cmp r1, #0
 ; CHECKV7A-NEXT:    it ne
 ; CHECKV7A-NEXT:    movne r0, lr
diff --git a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll
--- a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll
@@ -12,9 +12,7 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r0, r0, #1
 ; CHECK-NEXT:    adc r1, r1, #0
-; CHECK-NEXT:    lsr r2, r0, #16
-; CHECK-NEXT:    orr r2, r2, r1, lsl #16
-; CHECK-NEXT:    orr r2, r2, r1, lsr #16
+; CHECK-NEXT:    orr r2, r1, r0, lsr #16
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    bne .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %exit
@@ -34,9 +32,7 @@
 define i1 @opt_setcc_srl_eq_zero(i64 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_srl_eq_zero:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsr r0, r0, #17
-; CHECK-NEXT:    orr r0, r0, r1, lsl #15
-; CHECK-NEXT:    orr r0, r0, r1, lsr #17
+; CHECK-NEXT:    orr r0, r1, r0, lsr #17
 ; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    lsr r0, r0, #5
 ; CHECK-NEXT:    bx lr
@@ -48,9 +44,7 @@
 define i1 @opt_setcc_srl_ne_zero(i64 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_srl_ne_zero:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsr r0, r0, #17
-; CHECK-NEXT:    orr r0, r0, r1, lsl #15
-; CHECK-NEXT:    orr r0, r0, r1, lsr #17
+; CHECK-NEXT:    orr r0, r1, r0, lsr #17
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    movwne r0, #1
 ; CHECK-NEXT:    bx lr
@@ -62,9 +56,7 @@
 define i1 @opt_setcc_shl_eq_zero(i64 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_shl_eq_zero:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsl r1, r1, #17
-; CHECK-NEXT:    orr r1, r1, r0, lsr #15
-; CHECK-NEXT:    orr r0, r1, r0, lsl #17
+; CHECK-NEXT:    orr r0, r0, r1, lsl #17
 ; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    lsr r0, r0, #5
 ; CHECK-NEXT:    bx lr
@@ -76,9 +68,7 @@
 define i1 @opt_setcc_shl_ne_zero(i64 %a) nounwind {
 ; CHECK-LABEL: opt_setcc_shl_ne_zero:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsl r1, r1, #17
-; CHECK-NEXT:    orr r1, r1, r0, lsr #15
-; CHECK-NEXT:    orr r0, r1, r0, lsl #17
+; CHECK-NEXT:    orr r0, r0, r1, lsl #17
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    movwne r0, #1
 ; CHECK-NEXT:    bx lr
@@ -113,9 +103,7 @@
 define i1 @opt_setcc_expanded_shl_correct_shifts(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsl r0, r0, #17
-; CHECK-NEXT:    orr r0, r0, r1, lsr #15
-; CHECK-NEXT:    orr r0, r0, r1, lsl #17
+; CHECK-NEXT:    orr r0, r1, r0, lsl #17
 ; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    lsr r0, r0, #5
 ; CHECK-NEXT:    bx lr
@@ -148,4 +136,17 @@
   ret i1 %cmp
 }
 
+define i1 @opt_setcc_shl_ne_zero_i128(i128 %a) nounwind {
+; CHECK-LABEL: opt_setcc_shl_ne_zero_i128:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    orr r2, r2, r3, lsl #17
+; CHECK-NEXT:    orr r0, r1, r0
+; CHECK-NEXT:    orrs r0, r0, r2
+; CHECK-NEXT:    movwne r0, #1
+; CHECK-NEXT:    bx lr
+   %shl = shl i128 %a, 17
+   %cmp = icmp ne i128 %shl, 0
+   ret i1 %cmp
+}
+
 declare void @use(i64 %a)
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -13,34 +13,29 @@
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB0_1: # %loop
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    addl $1, %ecx
+; X86-NEXT:    addl $1, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    shldl $4, %edx, %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shldl $4, %esi, %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    shrl $28, %ecx
-; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    shrl $28, %ebp
+; X86-NEXT:    orl %ebx, %ebp
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %exit
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -56,11 +51,9 @@
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    addq $1, %rax
 ; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    shldq $4, %rax, %rcx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    shrq $60, %rsi
-; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $60, %rcx
+; X64-NEXT:    orq %rdx, %rcx
 ; X64-NEXT:    jne .LBB0_1
 ; X64-NEXT:  # %bb.2: # %exit
 ; X64-NEXT:    retq
@@ -79,30 +72,19 @@
 define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    shldl $15, %edx, %edi
-; X86-NEXT:    shldl $15, %ecx, %edx
-; X86-NEXT:    shrdl $17, %ecx, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    shrl $17, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $17, %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_eq_zero:
 ; X64:       # %bb.0:
-; X64-NEXT:    shrdq $17, %rsi, %rdi
-; X64-NEXT:    shrq $17, %rsi
-; X64-NEXT:    orq %rdi, %rsi
+; X64-NEXT:    shrq $17, %rdi
+; X64-NEXT:    orq %rsi, %rdi
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
    %srl = lshr i128 %a, 17
@@ -113,30 +95,19 @@
 define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    shldl $15, %edx, %edi
-; X86-NEXT:    shldl $15, %ecx, %edx
-; X86-NEXT:    shrdl $17, %ecx, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    shrl $17, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $17, %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_ne_zero:
 ; X64:       # %bb.0:
-; X64-NEXT:    shrdq $17, %rsi, %rdi
-; X64-NEXT:    shrq $17, %rsi
-; X64-NEXT:    orq %rdi, %rsi
+; X64-NEXT:    shrq $17, %rdi
+; X64-NEXT:    orq %rsi, %rdi
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
    %srl = lshr i128 %a, 17
@@ -147,27 +118,19 @@
 define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shldl $17, %esi, %edx
-; X86-NEXT:    shldl $17, %ecx, %esi
-; X86-NEXT:    shldl $17, %eax, %ecx
-; X86-NEXT:    shll $17, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_eq_zero:
 ; X64:       # %bb.0:
-; X64-NEXT:    shldq $17, %rdi, %rsi
-; X64-NEXT:    shlq $17, %rdi
-; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    shlq $17, %rsi
+; X64-NEXT:    orq %rdi, %rsi
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
    %shl = shl i128 %a, 17
@@ -178,27 +141,19 @@
 define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shldl $17, %esi, %edx
-; X86-NEXT:    shldl $17, %ecx, %esi
-; X86-NEXT:    shldl $17, %eax, %ecx
-; X86-NEXT:    shll $17, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_ne_zero:
 ; X64:       # %bb.0:
-; X64-NEXT:    shldq $17, %rdi, %rsi
-; X64-NEXT:    shlq $17, %rdi
-; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    shlq $17, %rsi
+; X64-NEXT:    orq %rdi, %rsi
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
    %shl = shl i128 %a, 17
@@ -262,27 +217,19 @@
 define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
 ; X86-LABEL: opt_setcc_expanded_shl_correct_shifts:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shldl $17, %edx, %esi
-; X86-NEXT:    shldl $17, %ecx, %edx
-; X86-NEXT:    shldl $17, %eax, %ecx
-; X86-NEXT:    shll $17, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_expanded_shl_correct_shifts:
 ; X64:       # %bb.0:
-; X64-NEXT:    shldq $17, %rsi, %rdi
-; X64-NEXT:    shlq $17, %rsi
-; X64-NEXT:    orq %rdi, %rsi
+; X64-NEXT:    shlq $17, %rdi
+; X64-NEXT:    orq %rsi, %rdi
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
   %shl.a = shl i64 %a, 17