Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -398,6 +398,10 @@
       COMPRESS,
       EXPAND,
 
+      // XOP arithmetic/logical shifts
+      VPSHA,
+      VPSHL,
+
       // Save xmm argument registers to the stack, according to %al. An operator
       // is needed so that this can be expanded with control flow.
       VASTART_SAVE_XMM_REGS,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1013,6 +1013,9 @@
     setOperationAction(ISD::SRA, MVT::v4i32, Custom);
   }
 
+  if (Subtarget->hasXOP())
+    setOperationAction(ISD::SRA, MVT::v2i64, Custom);
+
   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
@@ -16075,7 +16078,7 @@
   }
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
-  if (!Subtarget->is64Bit() &&
+  if (!Subtarget->is64Bit() && ISD::SRA != Op.getOpcode() &&
       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
@@ -16224,7 +16227,7 @@
   }
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
-  if (!Subtarget->is64Bit() &&
+  if (!Subtarget->is64Bit() && ISD::SRA != Op.getOpcode() &&
       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
        (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
@@ -16265,8 +16268,10 @@
   assert(VT.isVector() && "Custom lowering only for vector shifts!");
   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
 
-  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
-    return V;
+  // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+  if (!(VT == MVT::v16i8 && Subtarget->hasXOP()))
+    if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
+      return V;
 
   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
     return V;
@@ -16288,6 +16293,19 @@
       return Op;
   }
 
+  // XOP has 128-bit variable logical/arithmetic shifts.
+  // +ve/-ve Amt = shift left/right.
+  if (Subtarget->hasXOP() &&
+      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+       VT == MVT::v8i16 || VT == MVT::v16i8)) {
+    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA)
+      Amt = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Amt);
+    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+    if (Op.getOpcode() == ISD::SRA)
+      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+  }
+
   // 2i64 vector logical shifts can efficiently avoid scalarization - do the
   // shifts per-lane and then shuffle the partial results back together.
   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
@@ -17548,6 +17566,8 @@
   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
+  case X86ISD::VPSHA:              return "X86ISD::VPSHA";
+  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -170,6 +170,13 @@
 def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
 def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
 
+def X86vpshl : SDNode<"X86ISD::VPSHL",
+                      SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisVec<2>]>>;
+def X86vpsha : SDNode<"X86ISD::VPSHA",
+                      SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisVec<2>]>>;
+
 def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
                                           SDTCisVec<1>,
                                           SDTCisSameAs<2, 1>]>;
Index: lib/Target/X86/X86InstrXOP.td
===================================================================
--- lib/Target/X86/X86InstrXOP.td
+++ lib/Target/X86/X86InstrXOP.td
@@ -83,7 +83,42 @@
   defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
 }
 
-multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                  ValueType vt128> {
+  def rr : IXOP,
+           XOP_4VOp3, Sched<[WriteVarVecShift]>;
+  def rm : IXOP,
+           XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
+  def mr : IXOP,
+           XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
+  defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
+  defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
+  defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
+  defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
+  defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
+  defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
+  defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
+}
+
+multiclass xop3op_int<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP;
-  defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
-  defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
-  defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
-  defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
-  defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
-  defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
-  defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
-  defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
-  defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
-  defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
-  defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+  defm VPROTW : xop3op_int<0x91, "vprotw", int_x86_xop_vprotw>;
+  defm VPROTQ : xop3op_int<0x93, "vprotq", int_x86_xop_vprotq>;
+  defm VPROTD : xop3op_int<0x92, "vprotd", int_x86_xop_vprotd>;
+  defm VPROTB : xop3op_int<0x90, "vprotb", int_x86_xop_vprotb>;
 }
 
 multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -584,7 +584,15 @@
   X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
   X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
   X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
-  X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0)
+  X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+  X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
 };
 
 /*
Index: test/CodeGen/X86/xop-shifts.ll
===================================================================
--- test/CodeGen/X86/xop-shifts.ll
+++ test/CodeGen/X86/xop-shifts.ll
@@ -0,0 +1,386 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+xop | FileCheck %s -check-prefix=CHECK -check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+xop | FileCheck %s -check-prefix=CHECK -check-prefix=XOPAVX2
+
+;
+; Shift by Scalar Constants
+;
+
+define <16 x i8> @sll_16i8_sc(<16 x i8> %a0) {
+; CHECK-LABEL: @sll_16i8_sc
+; CHECK: vpshlb
+  %r = shl <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8_sc(<16 x i8> %a0) {
+; CHECK-LABEL: @sra_16i8_sc
+; CHECK: vpshab
+  %r = ashr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8_sc(<16 x i8> %a0) {
+; CHECK-LABEL: @srl_16i8_sc
+; CHECK: vpshlb
+  %r = lshr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16_sc(<8 x i16> %a0) {
+; CHECK-LABEL: @sll_8i16_sc
+; CHECK: vpsllw $5
+  %r = shl <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16_sc(<8 x i16> %a0) {
+; CHECK-LABEL: @sra_8i16_sc
+; CHECK: vpsraw $5
+  %r = ashr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16_sc(<8 x i16> %a0) {
+; CHECK-LABEL: @srl_8i16_sc
+; CHECK: vpsrlw $5
+  %r = lshr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32_sc(<4 x i32> %a0) {
+; CHECK-LABEL: @sll_4i32_sc
+; CHECK: vpslld $7
+  %r = shl <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32_sc(<4 x i32> %a0) {
+; CHECK-LABEL: @sra_4i32_sc
+; CHECK: vpsrad $7
+  %r = ashr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32_sc(<4 x i32> %a0) {
+; CHECK-LABEL: @srl_4i32_sc
+; CHECK: vpsrld $7
+  %r = lshr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64_sc(<2 x i64> %a0) {
+; CHECK-LABEL: @sll_2i64_sc
+; CHECK: vpsllq $11
+  %r = shl <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64_sc(<2 x i64> %a0) {
+; CHECK-LABEL: @sra_2i64_sc
+; CHECK: vpshaq
+  %r = ashr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64_sc(<2 x i64> %a0) {
+; CHECK-LABEL: @srl_2i64_sc
+; CHECK: vpsrlq $11
+  %r = lshr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+
+;
+; Shift by Vector Constants
+;
+
+define <16 x i8> @sll_16i8_c(<16 x i8> %a0) {
+; CHECK-LABEL: @sll_16i8_c
+; CHECK: vpshlb
+  %r = shl <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8_c(<16 x i8> %a0) {
+; CHECK-LABEL: @sra_16i8_c
+; CHECK: vpshab
+  %r = ashr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8_c(<16 x i8> %a0) {
+; CHECK-LABEL: @srl_16i8_c
+; CHECK: vpshlb
+  %r = lshr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16_c(<8 x i16> %a0) {
+; CHECK-LABEL: @sll_8i16_c
+; CHECK: vpshlw
+  %r = shl <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16_c(<8 x i16> %a0) {
+; CHECK-LABEL: @sra_8i16_c
+; CHECK: vpshaw
+  %r = ashr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16_c(<8 x i16> %a0) {
+; CHECK-LABEL: @srl_8i16_c
+; CHECK: vpshlw
+  %r = lshr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32_c(<4 x i32> %a0) {
+; XOPAVX1-LABEL: @sll_4i32_c
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @sll_4i32_c
+; XOPAVX2: vpsllvd
+  %r = shl <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32_c(<4 x i32> %a0) {
+; XOPAVX1-LABEL: @sra_4i32_c
+; XOPAVX1: vpshad
+; XOPAVX2-LABEL: @sra_4i32_c
+; XOPAVX2: vpsravd
+  %r = ashr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32_c(<4 x i32> %a0) {
+; XOPAVX1-LABEL: @srl_4i32_c
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @srl_4i32_c
+; XOPAVX2: vpsrlvd
+  %r = lshr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64_c(<2 x i64> %a0) {
+; XOPAVX1-LABEL: @sll_2i64_c
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @sll_2i64_c
+; XOPAVX2: vpsllvq
+  %r = shl <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64_c(<2 x i64> %a0) {
+; CHECK-LABEL: @sra_2i64_c
+; CHECK: vpshaq
+  %r = ashr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64_c(<2 x i64> %a0) {
+; XOPAVX1-LABEL: @srl_2i64_c
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @srl_2i64_c
+; XOPAVX2: vpsrlvq
+  %r = lshr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+
+;
+; Shift by Scalar Variable
+;
+
+define <16 x i8> @sll_16i8_s(<16 x i8> %a0, i8 %a1) {
+; CHECK-LABEL: @sll_16i8_s
+; CHECK: vpshlb
+  %s = insertelement <16 x i8> undef, i8 %a1, i32 0
+  %v = shufflevector <16 x i8> %s, <16 x i8> %s, <16 x i32> zeroinitializer
+  %r = shl <16 x i8> %a0, %v
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8_s(<16 x i8> %a0, i8 %a1) {
+; CHECK-LABEL: @sra_16i8_s
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshab
+  %s = insertelement <16 x i8> undef, i8 %a1, i32 0
+  %v = shufflevector <16 x i8> %s, <16 x i8> %s, <16 x i32> zeroinitializer
+  %r = ashr <16 x i8> %a0, %v
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8_s(<16 x i8> %a0, i8 %a1) {
+; CHECK-LABEL: @srl_16i8_s
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshlb
+  %s = insertelement <16 x i8> undef, i8 %a1, i32 0
+  %v = shufflevector <16 x i8> %s, <16 x i8> %s, <16 x i32> zeroinitializer
+  %r = lshr <16 x i8> %a0, %v
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16_s(<8 x i16> %a0, i16 %a1) {
+; CHECK-LABEL: @sll_8i16_s
+; CHECK: vpsllw
+  %s = insertelement <8 x i16> undef, i16 %a1, i32 0
+  %v = shufflevector <8 x i16> %s, <8 x i16> %s, <8 x i32> zeroinitializer
+  %r = shl <8 x i16> %a0, %v
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16_s(<8 x i16> %a0, i16 %a1) {
+; CHECK-LABEL: @sra_8i16_s
+; CHECK: vpsraw
+  %s = insertelement <8 x i16> undef, i16 %a1, i32 0
+  %v = shufflevector <8 x i16> %s, <8 x i16> %s, <8 x i32> zeroinitializer
+  %r = ashr <8 x i16> %a0, %v
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16_s(<8 x i16> %a0, i16 %a1) {
+; CHECK-LABEL: @srl_8i16_s
+; CHECK: vpsrlw
+  %s = insertelement <8 x i16> undef, i16 %a1, i32 0
+  %v = shufflevector <8 x i16> %s, <8 x i16> %s, <8 x i32> zeroinitializer
+  %r = lshr <8 x i16> %a0, %v
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32_s(<4 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @sll_4i32_s
+; CHECK: vpslld
+  %s = insertelement <4 x i32> undef, i32 %a1, i32 0
+  %v = shufflevector <4 x i32> %s, <4 x i32> %s, <4 x i32> zeroinitializer
+  %r = shl <4 x i32> %a0, %v
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32_s(<4 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @sra_4i32_s
+; CHECK: vpsrad
+  %s = insertelement <4 x i32> undef, i32 %a1, i32 0
+  %v = shufflevector <4 x i32> %s, <4 x i32> %s, <4 x i32> zeroinitializer
+  %r = ashr <4 x i32> %a0, %v
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32_s(<4 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @srl_4i32_s
+; CHECK: vpsrld
+  %s = insertelement <4 x i32> undef, i32 %a1, i32 0
+  %v = shufflevector <4 x i32> %s, <4 x i32> %s, <4 x i32> zeroinitializer
+  %r = lshr <4 x i32> %a0, %v
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64_s(<2 x i64> %a0, i64 %a1) {
+; CHECK-LABEL: @sll_2i64_s
+; CHECK: vpsllq
+  %s = insertelement <2 x i64> undef, i64 %a1, i32 0
+  %v = shufflevector <2 x i64> %s, <2 x i64> %s, <2 x i32> zeroinitializer
+  %r = shl <2 x i64> %a0, %v
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64_s(<2 x i64> %a0, i64 %a1) {
+; CHECK-LABEL: @sra_2i64_s
+; CHECK: vpshaq
+  %s = insertelement <2 x i64> undef, i64 %a1, i32 0
+  %v = shufflevector <2 x i64> %s, <2 x i64> %s, <2 x i32> zeroinitializer
+  %r = ashr <2 x i64> %a0, %v
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64_s(<2 x i64> %a0, i64 %a1) {
+; CHECK-LABEL: @srl_2i64_s
+; CHECK: vpsrlq
+  %s = insertelement <2 x i64> undef, i64 %a1, i32 0
+  %v = shufflevector <2 x i64> %s, <2 x i64> %s, <2 x i32> zeroinitializer
+  %r = lshr <2 x i64> %a0, %v
+  ret <2 x i64> %r
+}
+
+;
+; Shift by Vector Variable
+;
+
+define <16 x i8> @sll_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: @sll_16i8
+; CHECK: vpshlb
+  %r = shl <16 x i8> %a0, %a1
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: @sra_16i8
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshab
+  %r = ashr <16 x i8> %a0, %a1
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: @srl_16i8
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshlb
+  %r = lshr <16 x i8> %a0, %a1
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @sll_8i16
+; CHECK: vpshlw
+  %r = shl <8 x i16> %a0, %a1
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @sra_8i16
+; CHECK: vpxor
+; CHECK: vpsubw
+; CHECK: vpshaw
+  %r = ashr <8 x i16> %a0, %a1
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @srl_8i16
+; CHECK: vpxor
+; CHECK: vpsubw
+; CHECK: vpshlw
+  %r = lshr <8 x i16> %a0, %a1
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; XOPAVX1-LABEL: @sll_4i32
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @sll_4i32
+; XOPAVX2: vpsllvd
+  %r = shl <4 x i32> %a0, %a1
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; XOPAVX1-LABEL: @sra_4i32
+; XOPAVX1: vpxor
+; XOPAVX1: vpsubd
+; XOPAVX1: vpshad
+; XOPAVX2-LABEL: @sra_4i32
+; XOPAVX2: vpsravd
+  %r = ashr <4 x i32> %a0, %a1
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; XOPAVX1-LABEL: @srl_4i32
+; XOPAVX1: vpxor
+; XOPAVX1: vpsubd
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @srl_4i32
+; XOPAVX2: vpsrlvd
+  %r = lshr <4 x i32> %a0, %a1
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; XOPAVX1-LABEL: @sll_2i64
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @sll_2i64
+; XOPAVX2: vpsllvq
+  %r = shl <2 x i64> %a0, %a1
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: @sra_2i64
+; CHECK: vpxor
+; CHECK: vpsubq
+; CHECK: vpshaq
+  %r = ashr <2 x i64> %a0, %a1
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; XOPAVX1-LABEL: @srl_2i64
+; XOPAVX1: vpxor
+; XOPAVX1: vpsubq
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @srl_2i64
+; XOPAVX2: vpsrlvq
+  %r = lshr <2 x i64> %a0, %a1
+  ret <2 x i64> %r
+}