Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -33128,6 +33128,37 @@ return Ret; } +static SDValue getShiftDoubleNode(SDNode *N, SelectionDAG &DAG, unsigned Opc, + EVT VT, EVT OpVT, const SDValue &Op0, + const SDValue &Op1, const SDValue &ShAmt) { + SDLoc DL(N); + unsigned Bits = OpVT.getScalarSizeInBits(); + SDValue ShAmtI8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt); + if ((VT == MVT::i64 || VT == MVT::i32) && VT == OpVT) { + return DAG.getNode(Opc, DL, VT, Op0, Op1, ShAmtI8); + } else if (OpVT == MVT::i16) { + SDValue Op0I16 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op0); + SDValue Op1I16 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op1); + SDValue ShDbl = DAG.getNode(Opc, DL, OpVT, Op0I16, Op1I16, ShAmtI8); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, ShDbl); + } else if (OpVT == MVT::i8) { + + if (Opc == X86ISD::SHLD) { + SDValue Op1ShAmt = + DAG.getConstant(VT.getSizeInBits() - Bits, DL, MVT::i8); + SDValue Op1Shl = DAG.getNode(ISD::SHL, DL, VT, Op1, Op1ShAmt); + return DAG.getNode(Opc, DL, VT, Op0, Op1Shl, ShAmtI8); + } else if (Opc == X86ISD::SHRD) { + SDValue Op0ShAmt = + DAG.getConstant(VT.getSizeInBits() - Bits, DL, MVT::i8); + SDValue Op0Shl = DAG.getNode(ISD::SHL, DL, VT, Op0, Op0ShAmt); + SDValue Op0Shrd = DAG.getNode(Opc, DL, VT, Op0Shl, Op1, ShAmtI8); + return DAG.getNode(ISD::SRL, DL, VT, Op0Shrd, Op0ShAmt); + } + } + return SDValue(); +} + static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -33199,21 +33230,53 @@ SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); - if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) + if ((ShAmt1Op1.getOpcode() == ISD::TRUNCATE) || + (ShAmt1Op1.getOpcode() == ISD::ANY_EXTEND)) ShAmt1Op1 = ShAmt1Op1.getOperand(0); - if 
(SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) - return DAG.getNode(Opc, DL, VT, - Op0, Op1, - DAG.getNode(ISD::TRUNCATE, DL, - MVT::i8, ShAmt0)); + if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) { + return DAG.getNode(Opc, DL, VT, Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + } + if ((Op0.getOpcode() == ISD::AssertZext && + Op1.getOpcode() == ISD::AssertZext) && + ShAmt1Op1 == ShAmt0) { + // Op0 is ZEXT(Y, i16) + SDValue Op0Zext = Op0.getOperand(1); + VTSDNode *Op0VT = cast<VTSDNode>(Op0Zext); + // Op1 is ZEXT(X, i16) + SDValue Op1Zext = Op1.getOperand(1); + VTSDNode *Op1VT = cast<VTSDNode>(Op1Zext); + + if (Op0VT && Op1VT && (Op0VT->getVT() == Op1VT->getVT()) && + (Op0VT->getVT() == MVT::i16 || Op0VT->getVT() == MVT::i8)) { + return getShiftDoubleNode(N, DAG, Opc, VT, Op0VT->getVT(), Op0, Op1, + ShAmt0); + } + } + if ((Op0.getOpcode() == ISD::LOAD && Op1.getOpcode() == ISD::LOAD) && + ShAmt1Op1 == ShAmt0) { + LoadSDNode *Op0Ld = cast<LoadSDNode>(Op0); + LoadSDNode *Op1Ld = cast<LoadSDNode>(Op1); + EVT Op0LdVT = Op0Ld->getMemoryVT(); + EVT Op1LdVT = Op1Ld->getMemoryVT(); + + // If this is a ZEXTLoad. 
+ if (Op0Ld && Op1Ld && ((Op0LdVT == MVT::i8) || (Op0LdVT == MVT::i16)) && + (Op0LdVT == Op1LdVT) && + (ISD::LoadExtType::ZEXTLOAD == Op0Ld->getExtensionType()) && + (ISD::LoadExtType::ZEXTLOAD == Op1Ld->getExtensionType())) + + { + return getShiftDoubleNode(N, DAG, Opc, VT, Op0LdVT, Op0, Op1, ShAmt0); + } + } + //----------------------------------------- } } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) - return DAG.getNode(Opc, DL, VT, - N0.getOperand(0), N1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, - MVT::i8, ShAmt0)); + return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) { @@ -33232,7 +33295,126 @@ if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && Op1.getOperand(0) == Op1.getOperand(1)) { return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + } + } + // ShAmt1 node can be: XOR( ZEXT(C, i16), MASK) or it can be: XOR( C, + // MASK) + SDValue ShAmt1Zext = ShAmt1.getOperand(0); + if (ShAmt1Zext.getOpcode() == ISD::TRUNCATE) + ShAmt1Zext = ShAmt1Zext.getOperand(0); + + SDValue ShAmt1Const = ShAmt1.getOperand(1); + ConstantSDNode *XorMaskConst = dyn_cast<ConstantSDNode>(ShAmt1Const); + + // Op1 node is either SHL( ZEXT(Y, i16), 1) or SRL( ZEXT(Y, i16), 1) or + // SHL( LOAD(Y, i16), 1) + SDValue Op1Zext = Op1.getOperand(0); + SDValue Op1Const = Op1.getOperand(1); + ConstantSDNode *OneConst = dyn_cast<ConstantSDNode>(Op1Const); + + // Check for operands that were promoted to integer + if ((Op0.getOpcode() == ISD::AssertZext) && + (Op1.getOpcode() == ISD::SHL || Op1.getOpcode() == ISD::SRL) && + XorMaskConst && OneConst && (OneConst->getSExtValue() == 1) && + 
(Op1Zext.getOpcode() == ISD::AssertZext)) { + // Op0 node is ZEXT( X, i16) + // SDValue OpX = Op0.getOperand(0); // X (bits to shift) operand + SDValue Op0Prom = + Op0.getOperand(1); // X operand type before integer promotion + VTSDNode *Op0PromVT = cast<VTSDNode>(Op0Prom); + + // ShAmt0 node can be: ZEXT( C, i16) or it can be CopyFromReg(C) + EVT ShAmt0VT; + SDValue ShAmt0Prom; // C (shift amount) operand + if (ShAmt0.getOpcode() == ISD::AssertZext) { + ShAmt0Prom = ShAmt0.getOperand(0); // C (shift amount) operand + SDValue ShAmt0PromType = + ShAmt0.getOperand(1); // C operand type before integer promotion + VTSDNode *ShAmt0PromVT = cast<VTSDNode>(ShAmt0PromType); + ShAmt0VT = ShAmt0PromVT->getVT(); + } else { + ShAmt0Prom = ShAmt0; + ShAmt0VT = ShAmt0.getValueType(); + } + + // Op1Zext is ZEXT(Y, i16) + // SDValue OpY = Op1Zext.getOperand(0); // Y (bits source) operand + SDValue Op1Prom = + Op1Zext.getOperand(1); // Y operand type before integer promotion + VTSDNode *Op1PromVT = cast<VTSDNode>(Op1Prom); + + // ShAmt1Zext node is ZEXT( C, i16) or it can be CopyFromReg(C) + EVT ShAmt1VT; + SDValue ShAmt1Prom; // C (shift amount) operand + if (ShAmt1Zext.getOpcode() == ISD::AssertZext) { + ShAmt1Prom = ShAmt1Zext.getOperand(0); // C (shift amount) operand + SDValue ShAmt1PromType = ShAmt1Zext.getOperand( + 1); // C operand type before integer promotion + VTSDNode *ShAmt1PromVT = cast<VTSDNode>(ShAmt1PromType); + ShAmt1VT = ShAmt1PromVT->getVT(); + } else { + ShAmt1Prom = ShAmt1Zext; // C (shift amount) operand + ShAmt1VT = ShAmt1Zext.getValueType(); + } + + if (Op0PromVT && Op1PromVT && + (Op0PromVT->getVT() == Op1PromVT->getVT()) && + (ShAmt0VT == ShAmt1VT) && (ShAmt0Prom == ShAmt1Prom) && + (Op0PromVT->getVT() == MVT::i16 || Op0PromVT->getVT() == MVT::i8)) { + unsigned OpXBits = Op0PromVT->getVT().getSizeInBits(); + if (XorMaskConst->getSExtValue() == (OpXBits - 1)) { + return getShiftDoubleNode(N, DAG, Opc, VT, Op0PromVT->getVT(), Op0, + Op1Zext, ShAmt0); + } + } + } + // Op1 node is either SHL( 
LOAD(Y, i16), 1) or SRL( LOAD(Y, i16), 1) + SDValue Op1Load = Op1.getOperand(0); + + // ShAmt1 node can be: XOR( LOAD(C, i16), MASK) or it can be: XOR( C, + // MASK) + SDValue ShAmt1Load = ShAmt1.getOperand(0); + if (ShAmt1Load.getOpcode() == ISD::TRUNCATE) + ShAmt1Load = ShAmt1Load.getOperand(0); + + // Check for operands that were loaded from memory and promoted to integer + if ((Op0.getOpcode() == ISD::LOAD && + (Op1.getOpcode() == ISD::SHL || Op1.getOpcode() == ISD::SRL) && + XorMaskConst && OneConst && (OneConst->getSExtValue() == 1) && + (Op1Load.getOpcode() == ISD::LOAD))) { + // Op0 node is LOAD( X, i16) + LoadSDNode *Op0Ld = cast<LoadSDNode>(Op0); + EVT Op0VT = Op0Ld->getMemoryVT(); + + // ShAmt0 node can be: LOAD(C, zext i16) or it can be CopyFromReg(C) + EVT ShAmt0VT; + if (ShAmt0.getOpcode() == ISD::LOAD) { + LoadSDNode *ShAmt0Ld = cast<LoadSDNode>(ShAmt0); + ShAmt0VT = ShAmt0Ld->getMemoryVT(); + } else { + ShAmt0VT = ShAmt0.getValueType(); + } + // Op1Load is LOAD(Y, i16) + LoadSDNode *Op1Ld = cast<LoadSDNode>(Op1Load); + EVT Op1VT = Op1Ld->getMemoryVT(); + + // ShAmt1Load node is LOAD(C, zext i16) or it can be CopyFromReg(C) + EVT ShAmt1VT; + if (ShAmt1Load.getOpcode() == ISD::LOAD) { + LoadSDNode *ShAmt1Ld = cast<LoadSDNode>(ShAmt1Load); + ShAmt1VT = ShAmt1Ld->getMemoryVT(); + } else { + ShAmt1VT = ShAmt1Load.getValueType(); + } + + if ((Op0VT == Op1VT) && (ShAmt0VT == ShAmt1VT) && + (ShAmt0 == ShAmt1Load) && (Op0VT == MVT::i16 || Op0VT == MVT::i8)) { + unsigned Op0Bits = Op0VT.getSizeInBits(); + if (XorMaskConst->getSExtValue() == (Op0Bits - 1)) { + return getShiftDoubleNode(N, DAG, Opc, VT, Op0VT, Op0, Op1Load, + ShAmt0); + } } } } Index: test/CodeGen/X86/shift-double-x86_64.ll =================================================================== --- test/CodeGen/X86/shift-double-x86_64.ll +++ test/CodeGen/X86/shift-double-x86_64.ll @@ -107,3 +107,273 @@ %sh = or i64 %sh_lo, %sh_hi ret i64 %sh } + +;------------------------------------------------------------------------------------- +; double 
shift left pattern +;uint_t shld(uint_t a, uint_t b, int shift) +;{ +; return (a << shift) | (b >> (sizeof(uint_t)*8 - shift)); +;} + +define i64 @shld64_sh64(i64 %a, i64 %b, i64 %bits) nounwind { + +; CHECK-LABEL: shld64_sh64: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + + %shl = shl i64 %a, %bits + %sub = sub i64 64, %bits + %shr = lshr i64 %b, %sub + %or = or i64 %shr, %shl + ret i64 %or +} + +define i64 @shld64_sh32(i64 %a, i64 %b, i32 %bits) nounwind { + +; CHECK-LABEL: shld64_sh32: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + + %sh_prom = zext i32 %bits to i64 + %shl = shl i64 %a, %sh_prom + %sub = sub nsw i64 64, %sh_prom + %shr = lshr i64 %b, %sub + %or = or i64 %shr, %shl + ret i64 %or +} + +define i64 @shld64_sh16(i64 %a, i64 %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shld64_sh16: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + + %sh_prom = zext i16 %bits to i64 + %shl = shl i64 %a, %sh_prom + %sub = sub nsw i64 64, %sh_prom + %shr = lshr i64 %b, %sub + %or = or i64 %shr, %shl + ret i64 %or +} + +define i64 @shld64_sh8(i64 %a, i64 %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shld64_sh8: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + + %sh_prom = zext i8 %bits to i64 + %shl = shl i64 %a, %sh_prom + %sub = sub nsw i64 64, %sh_prom + %shr = lshr i64 %b, %sub + %or = or i64 %shr, %shl + ret i64 %or +} + +;------------------------------------------------------------------------------------- +; double shift right pattern +;uint_t shrd(uint_t a, uint_t b, int shift) +;{ +; return (a >> shift) | (b << ( sizeof(uint_t)*8 - shift)); +;} + + +define i64 @shrd64_sh64(i64 %a, i64 %b, i64 %bits) nounwind { +; CHECK-LABEL: shrd64_sh64: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; 
CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %shr = lshr i64 %a, %bits + %sub = sub i64 64, %bits + %shl = shl i64 %b, %sub + %or = or i64 %shl, %shr + ret i64 %or +} + +define i64 @shrd64_sh32(i64 %a, i64 %b, i32 %bits) nounwind { +; CHECK-LABEL: shrd64_sh32: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i32 %bits to i64 + %shr = lshr i64 %a, %sh_prom + %sub = sub nsw i64 64, %sh_prom + %shl = shl i64 %b, %sub + %or = or i64 %shl, %shr + ret i64 %or +} + +define i64 @shrd64_sh16(i64 %a, i64 %b, i16 zeroext %bits) nounwind { +; CHECK-LABEL: shrd64_sh16: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i16 %bits to i64 + %shr = lshr i64 %a, %sh_prom + %sub = sub nsw i64 64, %sh_prom + %shl = shl i64 %b, %sub + %or = or i64 %shl, %shr + ret i64 %or +} + +define i64 @shrd64_sh8(i64 %a, i64 %b, i8 zeroext %bits) nounwind { +; CHECK-LABEL: shrd64_sh8: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i8 %bits to i64 + %shr = lshr i64 %a, %sh_prom + %sub = sub nsw i64 64, %sh_prom + %shl = shl i64 %b, %sub + %or = or i64 %shl, %shr + ret i64 %or +} + +;------------------------------------------------------------------------------------- +; double shift left with xor pattern +;uint64_t shldx(uint64_t a, uint64_t b, shift_t bits) +;{ +; return (a << bits) | ((b >> 1) >> (bits ^ (sizeof(a)*8 - 1))); +;} + +define i64 @shld64x_sh64(i64 %a, i64 %b, i64 %bits) nounwind { +; CHECK-LABEL: shld64x_sh64: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %shl = shl i64 %a, %bits + %shr = lshr i64 %b, 1 + %xor = xor i64 %bits, 63 + %shr1 = lshr i64 %shr, %xor + %or = or i64 %shr1, %shl + ret i64 %or +} + +define i64 @shld64x_sh32(i64 %a, i64 %b, i32 %bits) nounwind { +; CHECK-LABEL: 
shld64x_sh32: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i32 %bits to i64 + %shl = shl i64 %a, %sh_prom + %shr = lshr i64 %b, 1 + %xor = xor i64 %sh_prom, 63 + %shr1 = lshr i64 %shr, %xor + %or = or i64 %shr1, %shl + ret i64 %or +} + +define i64 @shld64x_sh16(i64 %a, i64 %b, i16 zeroext %bits) nounwind { +; CHECK-LABEL: shld64x_sh16: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i16 %bits to i64 + %shl = shl i64 %a, %sh_prom + %shr = lshr i64 %b, 1 + %xor0 = xor i16 %bits, 63 + %xor = zext i16 %xor0 to i64 + %shr2 = lshr i64 %shr, %xor + %or = or i64 %shr2, %shl + ret i64 %or +} + +define i64 @shld64x_sh8(i64 %a, i64 %b, i8 zeroext %bits) nounwind { +; CHECK-LABEL: shld64x_sh8: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i8 %bits to i64 + %shl = shl i64 %a, %sh_prom + %shr = lshr i64 %b, 1 + %xor0 = xor i8 %bits, 63 + %xor = zext i8 %xor0 to i64 + %shr2 = lshr i64 %shr, %xor + %or = or i64 %shr2, %shl + ret i64 %or +} + +;------------------------------------------------------------------------------------- +; double shift right with xor pattern +;uint64_t shrdx(uint64_t a, uint64_t b, shift_t bits) +;{ +; return (a >> bits) | ((b << 1) << (bits ^ (sizeof(a)*8 - 1))); +;} + +define i64 @shrd64x_sh64(i64 %a, i64 %b, i64 %bits) nounwind { +; CHECK-LABEL: shrd64x_sh64: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %shr = lshr i64 %a, %bits + %shl = shl i64 %b, 1 + %xor = xor i64 %bits, 63 + %shl1 = shl i64 %shl, %xor + %or = or i64 %shl1, %shr + ret i64 %or +} + +define i64 @shrd64x_sh32(i64 %a, i64 %b, i32 %bits) nounwind { +; CHECK-LABEL: shrd64x_sh32: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq 
+ %sh_prom = zext i32 %bits to i64 + %shr = lshr i64 %a, %sh_prom + %shl = shl i64 %b, 1 + %xor = xor i64 %sh_prom, 63 + %shl1 = shl i64 %shl, %xor + %or = or i64 %shl1, %shr + ret i64 %or +} + +define i64 @shrd64x_sh16(i64 %a, i64 %b, i16 zeroext %bits) nounwind { +; CHECK-LABEL: shrd64x_sh16: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i16 %bits to i64 + %shr = lshr i64 %a, %sh_prom + %shl = shl i64 %b, 1 + %xor0 = xor i16 %bits, 63 + %xor = zext i16 %xor0 to i64 + %shl2 = shl i64 %shl, %xor + %or = or i64 %shl2, %shr + ret i64 %or +} + +define i64 @shrd64x_sh8(i64 %a, i64 %b, i8 zeroext %bits) nounwind { +; CHECK-LABEL: shrd64x_sh8: +; CHECK: movl %edx, %ecx +; CHECK-NEXT: shrdq %cl, %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK: retq + %sh_prom = zext i8 %bits to i64 + %shr = lshr i64 %a, %sh_prom + %shl = shl i64 %b, 1 + %xor0 = xor i8 %bits, 63 + %xor = zext i8 %xor0 to i64 + %shl2 = shl i64 %shl, %xor + %or = or i64 %shl2, %shr + ret i64 %or +} + Index: test/CodeGen/X86/shift-double.ll =================================================================== --- test/CodeGen/X86/shift-double.ll +++ test/CodeGen/X86/shift-double.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64 ; Shift i64 integers on 32-bit target @@ -310,3 +311,1310 @@ %sh = or i32 %sh_lo, %sh_hi ret i32 %sh } + + +;------------------------------------------------------------------------------------- +; double shift left pattern +;uint_t shld(uint_t a, uint_t b, int shift) +;{ +; return (a << shift) | (b >> (sizeof(uint_t)*8 - shift)); +;} + +define i32 @shld32_sh64(i32 %a, i32 %b, i64 %bits) nounwind { + +; CHECK-LABEL: shld32_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32_sh64: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %sh_prom = trunc i64 %bits to i32 + %shl = shl i32 %a, %sh_prom + %sub = sub i64 32, %bits + %sh_prom1 = trunc i64 %sub to i32 + %shr = lshr i32 %b, %sh_prom1 + %or = or i32 %shr, %shl + ret i32 %or +} + +define i32 @shld32_sh32(i32 %a, i32 %b, i32 %bits) nounwind { + +; CHECK-LABEL: shld32_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32_sh32: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %shl = shl i32 %a, %bits + %sub = sub i32 32, %bits + %shr = lshr i32 %b, %sub + %or = or i32 %shr, %shl + ret i32 %or +} + +define i32 @shld32_sh16(i32 %a, i32 %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shld32_sh16: +; CHECK: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32_sh16: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %bits to i32 + %shl = shl i32 %a, %conv + %sub = sub nsw i32 32, %conv + %shr = lshr i32 %b, %sub + %or = or i32 %shr, %shl + ret i32 %or +} + +define i32 @shld32_sh8(i32 %a, i32 %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shld32_sh8: +; CHECK: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32_sh8: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + 
%conv = zext i8 %bits to i32 + %shl = shl i32 %a, %conv + %sub = sub nsw i32 32, %conv + %shr = lshr i32 %b, %sub + %or = or i32 %shr, %shl + ret i32 %or +} + +define zeroext i16 @shld16_sh64(i16 zeroext %a, i16 zeroext %b, i64 %bits) nounwind { + +; CHECK-LABEL: shld16_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shld16_sh64: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %sh_prom = trunc i64 %bits to i32 + %shl = shl i32 %conv, %sh_prom + %conv1 = zext i16 %b to i32 + %sub = sub i64 16, %bits + %sh_prom2 = trunc i64 %sub to i32 + %shr = lshr i32 %conv1, %sh_prom2 + %or = or i32 %shr, %shl + %conv3 = trunc i32 %or to i16 + ret i16 %conv3 +} + +define zeroext i16 @shld16_sh32(i16 zeroext %a, i16 zeroext %b, i32 %bits) nounwind { + +; CHECK-LABEL: shld16_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shld16_sh32: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %shl = shl i32 %conv, %bits + %conv1 = zext i16 %b to i32 + %sub = sub i32 16, %bits + %shr = lshr i32 %conv1, %sub + %or = or i32 %shr, %shl + %conv3 = trunc i32 %or to i16 + ret i16 %conv3 +} + +define zeroext i16 @shld16_sh16(i16 zeroext %a, i16 zeroext %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shld16_sh16: +; CHECK: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shld16_sh16: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; 
CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %conv1 = zext i16 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i16 %b to i32 + %sub = sub nsw i32 16, %conv1 + %shr = lshr i32 %conv2, %sub + %or = or i32 %shr, %shl + %conv4 = trunc i32 %or to i16 + ret i16 %conv4 +} + +define zeroext i16 @shld16_sh8(i16 zeroext %a, i16 zeroext %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shld16_sh8: +; CHECK: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shld16_sh8: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %conv1 = zext i8 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i16 %b to i32 + %sub = sub nsw i32 16, %conv1 + %shr = lshr i32 %conv2, %sub + %or = or i32 %shr, %shl + %conv4 = trunc i32 %or to i16 + ret i16 %conv4 +} + +define zeroext i8 @shld8_sh64(i8 zeroext %a, i8 zeroext %b, i64 %bits) nounwind { + +; CHECK-LABEL: shld8_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8_sh64: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %sh_prom = trunc i64 %bits to i32 + %shl = shl i32 %conv, %sh_prom + %conv1 = zext i8 %b to i32 + %sub = sub i64 8, %bits + %sh_prom2 = trunc i64 %sub to i32 + %shr = lshr i32 %conv1, %sh_prom2 + %or = or i32 %shr, %shl + %conv3 = trunc i32 %or to i8 + ret i8 %conv3 +} + +define zeroext i8 @shld8_sh32(i8 zeroext %a, i8 zeroext %b, i32 %bits) nounwind { + +; CHECK-LABEL: shld8_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; 
CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8_sh32: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %shl = shl i32 %conv, %bits + %conv1 = zext i8 %b to i32 + %sub = sub i32 8, %bits + %shr = lshr i32 %conv1, %sub + %or = or i32 %shr, %shl + %conv3 = trunc i32 %or to i8 + ret i8 %conv3 +} + +define zeroext i8 @shld8_sh16(i8 zeroext %a, i8 zeroext %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shld8_sh16: +; CHECK: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8_sh16: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %conv1 = zext i16 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i8 %b to i32 + %sub = sub nsw i32 8, %conv1 + %shr = lshr i32 %conv2, %sub + %or = or i32 %shr, %shl + %conv4 = trunc i32 %or to i8 + ret i8 %conv4 +} + +define zeroext i8 @shld8_sh8(i8 zeroext %a, i8 zeroext %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shld8_sh8: +; CHECK: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8_sh8: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %conv1 = zext i8 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i8 %b to i32 + %sub = sub nsw i32 
8, %conv1 + %shr = lshr i32 %conv2, %sub + %or = or i32 %shr, %shl + %conv4 = trunc i32 %or to i8 + ret i8 %conv4 +} + +;------------------------------------------------------------------------------------- +; double shift right pattern +;uint_t shrd(uint_t a, uint_t b, int shift) +;{ +; return (a >> shift) | (b << ( sizeof(uint_t)*8 - shift)); +;} + +define i32 @shrd32_sh64(i32 %a, i32 %b, i64 %bits) nounwind { + +; CHECK-LABEL: shrd32_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd32_sh64: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %sh_prom = trunc i64 %bits to i32 + %shr = lshr i32 %a, %sh_prom + %sub = sub i64 32, %bits + %sh_prom1 = trunc i64 %sub to i32 + %shl = shl i32 %b, %sh_prom1 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i32 @shrd32_sh32(i32 %a, i32 %b, i32 %bits) nounwind { + +; CHECK-LABEL: shrd32_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd32_sh32: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %shr = lshr i32 %a, %bits + %sub = sub i32 32, %bits + %shl = shl i32 %b, %sub + %or = or i32 %shl, %shr + ret i32 %or +} + +define i32 @shrd32_sh16(i32 %a, i32 %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shrd32_sh16: +; CHECK: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd32_sh16: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %bits to i32 + %shr = 
lshr i32 %a, %conv + %sub = sub nsw i32 32, %conv + %shl = shl i32 %b, %sub + %or = or i32 %shl, %shr + ret i32 %or +} + +define i32 @shrd32_sh8(i32 %a, i32 %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shrd32_sh8: +; CHECK: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd32_sh8: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %bits to i32 + %shr = lshr i32 %a, %conv + %sub = sub nsw i32 32, %conv + %shl = shl i32 %b, %sub + %or = or i32 %shl, %shr + ret i32 %or +} + +define zeroext i16 @shrd16_sh64(i16 zeroext %a, i16 zeroext %b, i64 %bits) nounwind { + +; CHECK-LABEL: shrd16_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shrdw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shrd16_sh64: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %sh_prom = trunc i64 %bits to i32 + %shr = lshr i32 %conv, %sh_prom + %conv1 = zext i16 %b to i32 + %sub = sub i64 16, %bits + %sh_prom2 = trunc i64 %sub to i32 + %shl = shl i32 %conv1, %sh_prom2 + %or = or i32 %shl, %shr + %conv3 = trunc i32 %or to i16 + ret i16 %conv3 +} + +define zeroext i16 @shrd16_sh32(i16 zeroext %a, i16 zeroext %b, i32 %bits) nounwind { + +; CHECK-LABEL: shrd16_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shrdw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shrd16_sh32: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %shr = lshr i32 %conv, %bits + %conv1 = zext i16 %b to i32 + 
%sub = sub i32 16, %bits + %shl = shl i32 %conv1, %sub + %or = or i32 %shl, %shr + %conv3 = trunc i32 %or to i16 + ret i16 %conv3 +} + +define zeroext i16 @shrd16_sh16(i16 zeroext %a, i16 zeroext %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shrd16_sh16: +; CHECK: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shrdw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shrd16_sh16: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %conv1 = zext i16 %bits to i32 + %shr = lshr i32 %conv, %conv1 + %conv2 = zext i16 %b to i32 + %sub = sub nsw i32 16, %conv1 + %shl = shl i32 %conv2, %sub + %or = or i32 %shl, %shr + %conv4 = trunc i32 %or to i16 + ret i16 %conv4 +} + +define zeroext i16 @shrd16_sh8(i16 zeroext %a, i16 zeroext %b, i8 zeroext %bits) nounwind { +; CHECK-LABEL: shrd16_sh8: +; CHECK: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shrdw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shrd16_sh8: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shrdw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %conv1 = zext i8 %bits to i32 + %shr = lshr i32 %conv, %conv1 + %conv2 = zext i16 %b to i32 + %sub = sub nsw i32 16, %conv1 + %shl = shl i32 %conv2, %sub + %or = or i32 %shl, %shr + %conv4 = trunc i32 %or to i16 + ret i16 %conv4 +} + +define zeroext i8 @shrd8_sh64(i8 zeroext %a, i8 zeroext %b, i64 %bits) nounwind { + +; CHECK-LABEL: shrd8_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK-NEXT: shrl $24, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd8_sh64: +; CHECK64: shll $24, %edi +; CHECK64-NEXT: movl 
%edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: shrl $24, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %sh_prom = trunc i64 %bits to i32 + %shr = lshr i32 %conv, %sh_prom + %conv1 = zext i8 %b to i32 + %sub = sub i64 8, %bits + %sh_prom2 = trunc i64 %sub to i32 + %shl = shl i32 %conv1, %sh_prom2 + %or = or i32 %shl, %shr + %conv3 = trunc i32 %or to i8 + ret i8 %conv3 +} + +define zeroext i8 @shrd8_sh32(i8 zeroext %a, i8 zeroext %b, i32 %bits) nounwind { + +; CHECK-LABEL: shrd8_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK-NEXT: shrl $24, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd8_sh32: +; CHECK64: shll $24, %edi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: shrl $24, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %shr = lshr i32 %conv, %bits + %conv1 = zext i8 %b to i32 + %sub = sub i32 8, %bits + %shl = shl i32 %conv1, %sub + %or = or i32 %shl, %shr + %conv3 = trunc i32 %or to i8 + ret i8 %conv3 +} + +define zeroext i8 @shrd8_sh16(i8 zeroext %a, i8 zeroext %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shrd8_sh16: +; CHECK: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK-NEXT: shrl $24, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd8_sh16: +; CHECK64: shll $24, %edi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: shrl $24, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %conv1 = zext i16 %bits to i32 + %shr = lshr i32 %conv, %conv1 + %conv2 = zext i8 %b to i32 + %sub = sub nsw i32 8, %conv1 + %shl = shl i32 %conv2, %sub + %or = or i32 
%shl, %shr + %conv4 = trunc i32 %or to i8 + ret i8 %conv4 +} + +define zeroext i8 @shrd8_sh8(i8 zeroext %a, i8 zeroext %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shrd8_sh8: +; CHECK: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK-NEXT: shrl $24, %eax +; CHECK: retl + +; CHECK64-LABEL: shrd8_sh8: +; CHECK64: shll $24, %edi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shrdl %cl, %esi, %edi +; CHECK64-NEXT: shrl $24, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %conv1 = zext i8 %bits to i32 + %shr = lshr i32 %conv, %conv1 + %conv2 = zext i8 %b to i32 + %sub = sub nsw i32 8, %conv1 + %shl = shl i32 %conv2, %sub + %or = or i32 %shl, %shr + %conv4 = trunc i32 %or to i8 + ret i8 %conv4 +} + +;------------------------------------------------------------------------------------- +; double shift left with xor pattern +;uint64_t shldx(uint64_t a, uint64_t b, shift_t bits) +;{ +; return (a << bits) | ((b >> 1) >> (bits ^ (sizeof(a)*8 - 1))); +;} + + +define i32 @shld32x_sh64(i32 %a, i32 %b, i64 %bits) nounwind { + +; CHECK-LABEL: shld32x_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32x_sh64: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %sh_prom = trunc i64 %bits to i32 + %shl = shl i32 %a, %sh_prom + %shr = lshr i32 %b, 1 + %sh_prom1 = xor i32 %sh_prom, 31 + %shr2 = lshr i32 %shr, %sh_prom1 + %or = or i32 %shr2, %shl + ret i32 %or +} + +define i32 @shld32x_sh32(i32 %a, i32 %b, i32 %bits) nounwind { + +; CHECK-LABEL: shld32x_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32x_sh32: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %shl = shl i32 %a, %bits + %shr = lshr i32 %b, 1 + %xor0 = xor i32 %bits, 31 + %shr1 = lshr i32 %shr, %xor0 + %or = or i32 %shr1, %shl + ret i32 %or +} + +define i32 @shld32x_sh16(i32 %a, i32 %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shld32x_sh16: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32x_sh16: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %bits to i32 + %shl = shl i32 %a, %conv + %shr = lshr i32 %b, 1 + %xor0 = xor i16 %bits, 31 + %sh_prom = zext i16 %xor0 to i32 + %shr2 = lshr i32 %shr, %sh_prom + %or = or i32 %shr2, %shl + ret i32 %or +} + +define i32 @shld32x_sh8(i32 %a, i32 %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shld32x_sh8: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld32x_sh8: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %bits to i32 + %shl = shl i32 %a, %conv + %shr = lshr i32 %b, 1 + %xor0 = xor i8 %bits, 31 + %sh_prom = zext i8 %xor0 to i32 + %shr2 = lshr i32 %shr, %sh_prom + %or = or i32 %shr2, %shl + ret i32 %or +} + +define zeroext i16 @shld16x_sh64(i16 zeroext %a, i16 zeroext %b, i64 %bits) nounwind { + +; CHECK-LABEL: shld16x_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: 
shld16x_sh64: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %sh_prom = trunc i64 %bits to i32 + %shl = shl i32 %conv, %sh_prom + %conv1 = zext i16 %b to i32 + %lshr0 = lshr i32 %conv1, 1 + %sh_prom2 = xor i32 %sh_prom, 15 + %shr3 = lshr i32 %lshr0, %sh_prom2 + %or = or i32 %shr3, %shl + %conv4 = trunc i32 %or to i16 + ret i16 %conv4 +} + +define zeroext i16 @shld16x_sh32(i16 zeroext %a, i16 zeroext %b, i32 %bits) nounwind { + +; CHECK-LABEL: shld16x_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shld16x_sh32: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %shl = shl i32 %conv, %bits + %conv1 = zext i16 %b to i32 + %lshr0 = lshr i32 %conv1, 1 + %xor1 = xor i32 %bits, 15 + %shr3 = lshr i32 %lshr0, %xor1 + %or = or i32 %shr3, %shl + %conv4 = trunc i32 %or to i16 + ret i16 %conv4 +} + +define zeroext i16 @shld16x_sh16(i16 zeroext %a, i16 zeroext %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shld16x_sh16: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shld16x_sh16: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %conv1 = zext i16 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i16 %b to i32 + %lshr0 = lshr i32 %conv2, 1 + %xor1 = xor i16 %bits, 15 + %sh_prom = zext i16 %xor1 to i32 + %shr4 = lshr i32 %lshr0, %sh_prom + %or = or i32 %shr4, %shl + %conv5 = trunc i32 %or to i16 + ret i16 %conv5 +} + +define zeroext i16 @shld16x_sh8(i16 zeroext %a, i16 
zeroext %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shld16x_sh8: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shldw %cl, %dx, %ax +; CHECK: retl + +; CHECK64-LABEL: shld16x_sh8: +; CHECK64: movl %edx, %ecx +; CHECK64-NEXT: shldw %cl, %si, %di +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i16 %a to i32 + %conv1 = zext i8 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i16 %b to i32 + %lshr0 = lshr i32 %conv2, 1 + %xor1 = xor i8 %bits, 15 + %sh_prom = zext i8 %xor1 to i32 + %shr4 = lshr i32 %lshr0, %sh_prom + %or = or i32 %shr4, %shl + %conv5 = trunc i32 %or to i16 + ret i16 %conv5 +} + +define zeroext i8 @shld8x_sh64(i8 zeroext %a, i8 zeroext %b, i64 %bits) nounwind { + +; CHECK-LABEL: shld8x_sh64: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8x_sh64: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %sh_prom = trunc i64 %bits to i32 + %shl = shl i32 %conv, %sh_prom + %conv1 = zext i8 %b to i32 + %lshr0 = lshr i32 %conv1, 1 + %sh_prom2 = xor i32 %sh_prom, 7 + %shr3 = lshr i32 %lshr0, %sh_prom2 + %or = or i32 %shr3, %shl + %conv4 = trunc i32 %or to i8 + ret i8 %conv4 +} + +define zeroext i8 @shld8x_sh32(i8 zeroext %a, i8 zeroext %b, i32 %bits) nounwind { + +; CHECK-LABEL: shld8x_sh32: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8x_sh32: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi 
+; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %shl = shl i32 %conv, %bits + %conv1 = zext i8 %b to i32 + %lshr0 = lshr i32 %conv1, 1 + %xor1 = xor i32 %bits, 7 + %shr3 = lshr i32 %lshr0, %xor1 + %or = or i32 %shr3, %shl + %conv4 = trunc i32 %or to i8 + ret i8 %conv4 +} + +define zeroext i8 @shld8x_sh16(i8 zeroext %a, i8 zeroext %b, i16 zeroext %bits) nounwind { + +; CHECK-LABEL: shld8x_sh16: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8x_sh16: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %conv1 = zext i16 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i8 %b to i32 + %lshr0 = lshr i32 %conv2, 1 + %xor1 = xor i16 %bits, 7 + %sh_prom = zext i16 %xor1 to i32 + %shr4 = lshr i32 %lshr0, %sh_prom + %or = or i32 %shr4, %shl + %conv5 = trunc i32 %or to i8 + ret i8 %conv5 +} + +define zeroext i8 @shld8x_sh8(i8 zeroext %a, i8 zeroext %b, i8 zeroext %bits) nounwind { + +; CHECK-LABEL: shld8x_sh8: +; CHECK: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: shldl %cl, %edx, %eax +; CHECK: retl + +; CHECK64-LABEL: shld8x_sh8: +; CHECK64: shll $24, %esi +; CHECK64-NEXT: movl %edx, %ecx +; CHECK64-NEXT: shldl %cl, %esi, %edi +; CHECK64-NEXT: movl %edi, %eax +; CHECK64: retq + + %conv = zext i8 %a to i32 + %conv1 = zext i8 %bits to i32 + %shl = shl i32 %conv, %conv1 + %conv2 = zext i8 %b to i32 + %lshr0 = lshr i32 %conv2, 1 + %xor1 = xor i8 %bits, 7 + %sh_prom = zext i8 %xor1 to i32 + %shr4 = lshr i32 %lshr0, %sh_prom + %or = or i32 %shr4, %shl + %conv5 = trunc i32 %or to i8 + ret i8 %conv5 +} + 
+;-------------------------------------------------------------------------------------
+; double shift right with xor pattern
+;uint64_t shrdx(uint64_t a, uint64_t b, shift_t bits)
+;{
+; return (a >> bits) | ((b << 1) << (bits ^ (sizeof(a)*8 - 1)));
+;}
+;
+; NOTE(review): the tests below differ only in the result width (i32/i16/i8)
+; and in the type of %bits (i64/i32/i16/i8). Per the CHECK lines, each is
+; expected to fold into a single shrdl/shrdw-based sequence rather than the
+; literal lshr/shl/or expansion.
+
+; i32 result, i64 shift amount (truncated to i32 before use); expect one shrdl.
+define i32 @shrd32x_sh64(i32 %a, i32 %b, i64 %bits) nounwind {
+
+; CHECK-LABEL: shrd32x_sh64:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: retl
+
+; CHECK64-LABEL: shrd32x_sh64:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %sh_prom = trunc i64 %bits to i32
+  %shr = lshr i32 %a, %sh_prom
+  %shl = shl i32 %b, 1
+  %sh_prom1 = xor i32 %sh_prom, 31
+  %shl2 = shl i32 %shl, %sh_prom1
+  %or = or i32 %shl2, %shr
+  ret i32 %or
+}
+
+; i32 result, i32 shift amount; expect one shrdl.
+define i32 @shrd32x_sh32(i32 %a, i32 %b, i32 %bits) nounwind {
+
+; CHECK-LABEL: shrd32x_sh32:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: retl
+
+; CHECK64-LABEL: shrd32x_sh32:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %shr = lshr i32 %a, %bits
+  %shl = shl i32 %b, 1
+  %xor0 = xor i32 %bits, 31
+  %shl1 = shl i32 %shl, %xor0
+  %or = or i32 %shl1, %shr
+  ret i32 %or
+}
+
+; i32 result, i16 shift amount; the xor is done in i16 then zero-extended.
+define i32 @shrd32x_sh16(i32 %a, i32 %b, i16 zeroext %bits) nounwind {
+
+; CHECK-LABEL: shrd32x_sh16:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: retl
+
+; CHECK64-LABEL: shrd32x_sh16:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i16 %bits to i32
+  %shr = lshr i32 %a, %conv
+  %shl = shl i32 %b, 1
+  %xor0 = xor i16 %bits, 31
+  %sh_prom = zext i16 %xor0 to i32
+  %shl2 = shl i32 %shl, %sh_prom
+  %or = or i32 %shl2, %shr
+  ret i32 %or
+}
+
+; i32 result, i8 shift amount; the xor is done in i8 then zero-extended.
+define i32 @shrd32x_sh8(i32 %a, i32 %b, i8 zeroext %bits) nounwind {
+
+; CHECK-LABEL: shrd32x_sh8:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: retl
+
+; CHECK64-LABEL: shrd32x_sh8:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i8 %bits to i32
+  %shr = lshr i32 %a, %conv
+  %shl = shl i32 %b, 1
+  %xor0 = xor i8 %bits, 31
+  %sh_prom = zext i8 %xor0 to i32
+  %shl2 = shl i32 %shl, %sh_prom
+  %or = or i32 %shl2, %shr
+  ret i32 %or
+}
+
+; i16 result (operands zero-extended to i32, xor mask 15); expect one shrdw.
+define zeroext i16 @shrd16x_sh64(i16 zeroext %a, i16 zeroext %b, i64 %bits) nounwind {
+
+; CHECK-LABEL: shrd16x_sh64:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shrdw %cl, %dx, %ax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd16x_sh64:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdw %cl, %si, %di
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i16 %a to i32
+  %sh_prom = trunc i64 %bits to i32
+  %shr = lshr i32 %conv, %sh_prom
+  %conv1 = zext i16 %b to i32
+  %shl = shl nuw nsw i32 %conv1, 1
+  %sh_prom2 = xor i32 %sh_prom, 15
+  %shl3 = shl i32 %shl, %sh_prom2
+  %or = or i32 %shl3, %shr
+  %conv4 = trunc i32 %or to i16
+  ret i16 %conv4
+}
+
+; i16 result, i32 shift amount; expect one shrdw.
+define zeroext i16 @shrd16x_sh32(i16 zeroext %a, i16 zeroext %b, i32 %bits) nounwind {
+
+; CHECK-LABEL: shrd16x_sh32:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shrdw %cl, %dx, %ax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd16x_sh32:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdw %cl, %si, %di
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i16 %a to i32
+  %shr = lshr i32 %conv, %bits
+  %conv1 = zext i16 %b to i32
+  %shl = shl nuw nsw i32 %conv1, 1
+  %xor0 = xor i32 %bits, 15
+  %shl3 = shl i32 %shl, %xor0
+  %or = or i32 %shl3, %shr
+  %conv4 = trunc i32 %or to i16
+  ret i16 %conv4
+}
+
+; i16 result, i16 shift amount; expect one shrdw.
+define zeroext i16 @shrd16x_sh16(i16 zeroext %a, i16 zeroext %b, i16 zeroext %bits) nounwind {
+
+; CHECK-LABEL: shrd16x_sh16:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shrdw %cl, %dx, %ax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd16x_sh16:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdw %cl, %si, %di
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i16 %a to i32
+  %conv1 = zext i16 %bits to i32
+  %shr = lshr i32 %conv, %conv1
+  %conv2 = zext i16 %b to i32
+  %shl = shl nuw nsw i32 %conv2, 1
+  %xor0 = xor i16 %bits, 15
+  %sh_prom = zext i16 %xor0 to i32
+  %shl4 = shl i32 %shl, %sh_prom
+  %or = or i32 %shl4, %shr
+  %conv5 = trunc i32 %or to i16
+  ret i16 %conv5
+}
+
+; i16 result, i8 shift amount; expect one shrdw.
+define zeroext i16 @shrd16x_sh8(i16 zeroext %a, i16 zeroext %b, i8 zeroext %bits) nounwind {
+
+; CHECK-LABEL: shrd16x_sh8:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shrdw %cl, %dx, %ax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd16x_sh8:
+; CHECK64: movl %edx, %ecx
+; CHECK64-NEXT: shrdw %cl, %si, %di
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i16 %a to i32
+  %conv1 = zext i8 %bits to i32
+  %shr = lshr i32 %conv, %conv1
+  %conv2 = zext i16 %b to i32
+  %shl = shl nuw nsw i32 %conv2, 1
+  %xor0 = xor i8 %bits, 15
+  %sh_prom = zext i8 %xor0 to i32
+  %shl4 = shl i32 %shl, %sh_prom
+  %or = or i32 %shl4, %shr
+  %conv5 = trunc i32 %or to i16
+  ret i16 %conv5
+}
+
+; i8 result (xor mask 7): per the CHECKs this is widened — shll $24 before a
+; 32-bit shrdl and shrl $24 after — presumably because there is no byte-sized
+; shrd form; matches the i8 path of getShiftDoubleNode.
+define zeroext i8 @shrd8x_sh64(i8 zeroext %a, i8 zeroext %b, i64 %bits) nounwind {
+
+; CHECK-LABEL: shrd8x_sh64:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shll $24, %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: shrl $24, %eax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd8x_sh64:
+; CHECK64: shll $24, %edi
+; CHECK64-NEXT: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: shrl $24, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i8 %a to i32
+  %sh_prom = trunc i64 %bits to i32
+  %shr = lshr i32 %conv, %sh_prom
+  %conv1 = zext i8 %b to i32
+  %shl = shl nuw nsw i32 %conv1, 1
+  %sh_prom2 = xor i32 %sh_prom, 7
+  %shl3 = shl i32 %shl, %sh_prom2
+  %or = or i32 %shl3, %shr
+  %conv4 = trunc i32 %or to i8
+  ret i8 %conv4
+}
+
+; i8 result, i32 shift amount; widened shrdl sequence as above.
+define zeroext i8 @shrd8x_sh32(i8 zeroext %a, i8 zeroext %b, i32 %bits) nounwind {
+
+; CHECK-LABEL: shrd8x_sh32:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shll $24, %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: shrl $24, %eax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd8x_sh32:
+; CHECK64: shll $24, %edi
+; CHECK64-NEXT: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: shrl $24, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i8 %a to i32
+  %shr = lshr i32 %conv, %bits
+  %conv1 = zext i8 %b to i32
+  %shl = shl nuw nsw i32 %conv1, 1
+  %xor0 = xor i32 %bits, 7
+  %shl3 = shl i32 %shl, %xor0
+  %or = or i32 %shl3, %shr
+  %conv4 = trunc i32 %or to i8
+  ret i8 %conv4
+}
+
+; i8 result, i16 shift amount; widened shrdl sequence as above.
+define zeroext i8 @shrd8x_sh16(i8 zeroext %a, i8 zeroext %b, i16 zeroext %bits) nounwind {
+
+; CHECK-LABEL: shrd8x_sh16:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shll $24, %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: shrl $24, %eax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd8x_sh16:
+; CHECK64: shll $24, %edi
+; CHECK64-NEXT: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: shrl $24, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i8 %a to i32
+  %conv1 = zext i16 %bits to i32
+  %shr = lshr i32 %conv, %conv1
+  %conv2 = zext i8 %b to i32
+  %shl = shl nuw nsw i32 %conv2, 1
+  %xor0 = xor i16 %bits, 7
+  %sh_prom = zext i16 %xor0 to i32
+  %shl4 = shl i32 %shl, %sh_prom
+  %or = or i32 %shl4, %shr
+  %conv5 = trunc i32 %or to i8
+  ret i8 %conv5
+}
+
+; i8 result, i8 shift amount; widened shrdl sequence as above.
+define zeroext i8 @shrd8x_sh8(i8 zeroext %a, i8 zeroext %b, i8 zeroext %bits) nounwind {
+
+; CHECK-LABEL: shrd8x_sh8:
+; CHECK: movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: shll $24, %eax
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: shrl $24, %eax
+; CHECK: retl
+
+; CHECK64-LABEL: shrd8x_sh8:
+; CHECK64: shll $24, %edi
+; CHECK64-NEXT: movl %edx, %ecx
+; CHECK64-NEXT: shrdl %cl, %esi, %edi
+; CHECK64-NEXT: shrl $24, %edi
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64: retq
+
+  %conv = zext i8 %a to i32
+  %conv1 = zext i8 %bits to i32
+  %shr = lshr i32 %conv, %conv1
+  %conv2 = zext i8 %b to i32
+  %shl = shl nuw nsw i32 %conv2, 1
+  %xor0 = xor i8 %bits, 7
+  %sh_prom = zext i8 %xor0 to i32
+  %shl4 = shl i32 %shl, %sh_prom
+  %or = or i32 %shl4, %shr
+  %conv5 = trunc i32 %or to i8
+  ret i8 %conv5
+}