Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -398,6 +398,10 @@
       COMPRESS,
       EXPAND,
 
+      // XOP arithmetic/logical shifts
+      VPSHA,
+      VPSHL,
+
       // Save xmm argument registers to the stack, according to %al. An operator
       // is needed so that this can be expanded with control flow.
       VASTART_SAVE_XMM_REGS,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1013,6 +1013,9 @@
     setOperationAction(ISD::SRA, MVT::v4i32, Custom);
   }
 
+  if (Subtarget->hasXOP())
+    setOperationAction(ISD::SRA, MVT::v2i64, Custom);
+
   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
@@ -16075,7 +16078,7 @@
   }
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
-  if (!Subtarget->is64Bit() &&
+  if (!Subtarget->is64Bit() && ISD::SRA != Op.getOpcode() &&
       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
@@ -16224,7 +16227,7 @@
   }
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
-  if (!Subtarget->is64Bit() &&
+  if (!Subtarget->is64Bit() && ISD::SRA != Op.getOpcode() &&
       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
        (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
@@ -16265,8 +16268,10 @@
   assert(VT.isVector() && "Custom lowering only for vector shifts!");
   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
 
-  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
-    return V;
+  // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+  if (!(VT == MVT::v16i8 && Subtarget->hasXOP()))
+    if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
+      return V;
 
   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
     return V;
@@ -16288,6 +16293,19 @@
       return Op;
   }
 
+  // XOP has 128-bit variable logical/arithmetic shifts.
+  // +ve/-ve Amt = shift left/right.
+  if (Subtarget->hasXOP() &&
+      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+       VT == MVT::v8i16 || VT == MVT::v16i8)) {
+    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA)
+      Amt = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Amt);
+    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+    if (Op.getOpcode() == ISD::SRA)
+      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+  }
+
   // 2i64 vector logical shifts can efficiently avoid scalarization - do the
   // shifts per-lane and then shuffle the partial results back together.
   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
@@ -17548,6 +17566,8 @@
   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
+  case X86ISD::VPSHA:              return "X86ISD::VPSHA";
+  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -170,6 +170,13 @@
 def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
 def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
 
+def X86vpshl : SDNode<"X86ISD::VPSHL",
+                      SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisVec<2>]>>;
+def X86vpsha : SDNode<"X86ISD::VPSHA",
+                      SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisVec<2>]>>;
+
 def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
                                           SDTCisVec<1>,
                                           SDTCisSameAs<2, 1>]>;
Index: lib/Target/X86/X86InstrXOP.td
===================================================================
--- lib/Target/X86/X86InstrXOP.td
+++ lib/Target/X86/X86InstrXOP.td
@@ -83,7 +83,42 @@
   defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
 }
 
-multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                  ValueType vt128> {
+  def rr : IXOP,
+           XOP_4VOp3, Sched<[WriteVarVecShift]>;
+  def rm : IXOP,
+           XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
+  def mr : IXOP,
+           XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
+  defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
+  defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
+  defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
+  defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
+  defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
+  defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
+  defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
+}
+
+multiclass xop3op_int<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP;
-  defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
-  defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
-  defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
-  defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
-  defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
-  defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
-  defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
-  defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
-  defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
-  defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
-  defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+  defm VPROTW : xop3op_int<0x91, "vprotw", int_x86_xop_vprotw>;
+  defm VPROTQ : xop3op_int<0x93, "vprotq", int_x86_xop_vprotq>;
+  defm VPROTD : xop3op_int<0x92, "vprotd", int_x86_xop_vprotd>;
+  defm VPROTB : xop3op_int<0x90, "vprotb", int_x86_xop_vprotb>;
 }
 
 multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -584,7 +584,15 @@
   X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
   X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
   X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
-  X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0)
+  X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+  X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
 };
 
 /*
Index: test/CodeGen/X86/xop-shifts.ll
===================================================================
--- test/CodeGen/X86/xop-shifts.ll
+++ test/CodeGen/X86/xop-shifts.ll
@@ -0,0 +1,386 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+xop | FileCheck %s -check-prefix=CHECK -check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+xop | FileCheck %s -check-prefix=CHECK -check-prefix=XOPAVX2
+
+;
+; Shift by Scalar Constants
+;
+
+define <16 x i8> @sll_16i8_sc(<16 x i8> %a0) {
+; CHECK-LABEL: @sll_16i8_sc
+; CHECK: vpshlb
+  %r = shl <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8_sc(<16 x i8> %a0) {
+; CHECK-LABEL: @sra_16i8_sc
+; CHECK: vpshab
+  %r = ashr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8_sc(<16 x i8> %a0) {
+; CHECK-LABEL: @srl_16i8_sc
+; CHECK: vpshlb
+  %r = lshr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16_sc(<8 x i16> %a0) {
+; CHECK-LABEL: @sll_8i16_sc
+; CHECK: vpsllw $5
+  %r = shl <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16_sc(<8 x i16> %a0) {
+; CHECK-LABEL: @sra_8i16_sc
+; CHECK: vpsraw $5
+  %r = ashr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16_sc(<8 x i16> %a0) {
+; CHECK-LABEL: @srl_8i16_sc
+; CHECK: vpsrlw $5
+  %r = lshr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32_sc(<4 x i32> %a0) {
+; CHECK-LABEL: @sll_4i32_sc
+; CHECK: vpslld $7
+  %r = shl <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32_sc(<4 x i32> %a0) {
+; CHECK-LABEL: @sra_4i32_sc
+; CHECK: vpsrad $7
+  %r = ashr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32_sc(<4 x i32> %a0) {
+; CHECK-LABEL: @srl_4i32_sc
+; CHECK: vpsrld $7
+  %r = lshr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64_sc(<2 x i64> %a0) {
+; CHECK-LABEL: @sll_2i64_sc
+; CHECK: vpsllq $11
+  %r = shl <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64_sc(<2 x i64> %a0) {
+; CHECK-LABEL: @sra_2i64_sc
+; CHECK: vpshaq
+  %r = ashr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64_sc(<2 x i64> %a0) {
+; CHECK-LABEL: @srl_2i64_sc
+; CHECK: vpsrlq $11
+  %r = lshr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+
+;
+; Shift by Vector Constants
+;
+
+define <16 x i8> @sll_16i8_c(<16 x i8> %a0) {
+; CHECK-LABEL: @sll_16i8_c
+; CHECK: vpshlb
+  %r = shl <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8_c(<16 x i8> %a0) {
+; CHECK-LABEL: @sra_16i8_c
+; CHECK: vpshab
+  %r = ashr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8_c(<16 x i8> %a0) {
+; CHECK-LABEL: @srl_16i8_c
+; CHECK: vpshlb
+  %r = lshr <16 x i8> %a0,
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16_c(<8 x i16> %a0) {
+; CHECK-LABEL: @sll_8i16_c
+; CHECK: vpshlw
+  %r = shl <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16_c(<8 x i16> %a0) {
+; CHECK-LABEL: @sra_8i16_c
+; CHECK: vpshaw
+  %r = ashr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16_c(<8 x i16> %a0) {
+; CHECK-LABEL: @srl_8i16_c
+; CHECK: vpshlw
+  %r = lshr <8 x i16> %a0,
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32_c(<4 x i32> %a0) {
+; XOPAVX1-LABEL: @sll_4i32_c
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @sll_4i32_c
+; XOPAVX2: vpsllvd
+  %r = shl <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32_c(<4 x i32> %a0) {
+; XOPAVX1-LABEL: @sra_4i32_c
+; XOPAVX1: vpshad
+; XOPAVX2-LABEL: @sra_4i32_c
+; XOPAVX2: vpsravd
+  %r = ashr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32_c(<4 x i32> %a0) {
+; XOPAVX1-LABEL: @srl_4i32_c
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @srl_4i32_c
+; XOPAVX2: vpsrlvd
+  %r = lshr <4 x i32> %a0,
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64_c(<2 x i64> %a0) {
+; XOPAVX1-LABEL: @sll_2i64_c
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @sll_2i64_c
+; XOPAVX2: vpsllvq
+  %r = shl <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64_c(<2 x i64> %a0) {
+; CHECK-LABEL: @sra_2i64_c
+; CHECK: vpshaq
+  %r = ashr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64_c(<2 x i64> %a0) {
+; XOPAVX1-LABEL: @srl_2i64_c
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @srl_2i64_c
+; XOPAVX2: vpsrlvq
+  %r = lshr <2 x i64> %a0,
+  ret <2 x i64> %r
+}
+
+;
+; Shift by Scalar Variable
+;
+
+define <16 x i8> @sll_16i8_s(<16 x i8> %a0, i8 %a1) {
+; CHECK-LABEL: @sll_16i8_s
+; CHECK: vpshlb
+  %s = insertelement <16 x i8> undef, i8 %a1, i32 0
+  %v = shufflevector <16 x i8> %s, <16 x i8> %s, <16 x i32> zeroinitializer
+  %r = shl <16 x i8> %a0, %v
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8_s(<16 x i8> %a0, i8 %a1) {
+; CHECK-LABEL: @sra_16i8_s
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshab
+  %s = insertelement <16 x i8> undef, i8 %a1, i32 0
+  %v = shufflevector <16 x i8> %s, <16 x i8> %s, <16 x i32> zeroinitializer
+  %r = ashr <16 x i8> %a0, %v
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8_s(<16 x i8> %a0, i8 %a1) {
+; CHECK-LABEL: @srl_16i8_s
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshlb
+  %s = insertelement <16 x i8> undef, i8 %a1, i32 0
+  %v = shufflevector <16 x i8> %s, <16 x i8> %s, <16 x i32> zeroinitializer
+  %r = lshr <16 x i8> %a0, %v
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16_s(<8 x i16> %a0, i16 %a1) {
+; CHECK-LABEL: @sll_8i16_s
+; CHECK: vpsllw
+  %s = insertelement <8 x i16> undef, i16 %a1, i32 0
+  %v = shufflevector <8 x i16> %s, <8 x i16> %s, <8 x i32> zeroinitializer
+  %r = shl <8 x i16> %a0, %v
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16_s(<8 x i16> %a0, i16 %a1) {
+; CHECK-LABEL: @sra_8i16_s
+; CHECK: vpsraw
+  %s = insertelement <8 x i16> undef, i16 %a1, i32 0
+  %v = shufflevector <8 x i16> %s, <8 x i16> %s, <8 x i32> zeroinitializer
+  %r = ashr <8 x i16> %a0, %v
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16_s(<8 x i16> %a0, i16 %a1) {
+; CHECK-LABEL: @srl_8i16_s
+; CHECK: vpsrlw
+  %s = insertelement <8 x i16> undef, i16 %a1, i32 0
+  %v = shufflevector <8 x i16> %s, <8 x i16> %s, <8 x i32> zeroinitializer
+  %r = lshr <8 x i16> %a0, %v
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32_s(<4 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @sll_4i32_s
+; CHECK: vpslld
+  %s = insertelement <4 x i32> undef, i32 %a1, i32 0
+  %v = shufflevector <4 x i32> %s, <4 x i32> %s, <4 x i32> zeroinitializer
+  %r = shl <4 x i32> %a0, %v
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32_s(<4 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @sra_4i32_s
+; CHECK: vpsrad
+  %s = insertelement <4 x i32> undef, i32 %a1, i32 0
+  %v = shufflevector <4 x i32> %s, <4 x i32> %s, <4 x i32> zeroinitializer
+  %r = ashr <4 x i32> %a0, %v
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32_s(<4 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @srl_4i32_s
+; CHECK: vpsrld
+  %s = insertelement <4 x i32> undef, i32 %a1, i32 0
+  %v = shufflevector <4 x i32> %s, <4 x i32> %s, <4 x i32> zeroinitializer
+  %r = lshr <4 x i32> %a0, %v
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64_s(<2 x i64> %a0, i64 %a1) {
+; CHECK-LABEL: @sll_2i64_s
+; CHECK: vpsllq
+  %s = insertelement <2 x i64> undef, i64 %a1, i32 0
+  %v = shufflevector <2 x i64> %s, <2 x i64> %s, <2 x i32> zeroinitializer
+  %r = shl <2 x i64> %a0, %v
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64_s(<2 x i64> %a0, i64 %a1) {
+; CHECK-LABEL: @sra_2i64_s
+; CHECK: vpshaq
+  %s = insertelement <2 x i64> undef, i64 %a1, i32 0
+  %v = shufflevector <2 x i64> %s, <2 x i64> %s, <2 x i32> zeroinitializer
+  %r = ashr <2 x i64> %a0, %v
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64_s(<2 x i64> %a0, i64 %a1) {
+; CHECK-LABEL: @srl_2i64_s
+; CHECK: vpsrlq
+  %s = insertelement <2 x i64> undef, i64 %a1, i32 0
+  %v = shufflevector <2 x i64> %s, <2 x i64> %s, <2 x i32> zeroinitializer
+  %r = lshr <2 x i64> %a0, %v
+  ret <2 x i64> %r
+}
+
+;
+; Shift by Vector Variable
+;
+
+define <16 x i8> @sll_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: @sll_16i8
+; CHECK: vpshlb
+  %r = shl <16 x i8> %a0, %a1
+  ret <16 x i8> %r
+}
+define <16 x i8> @sra_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: @sra_16i8
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshab
+  %r = ashr <16 x i8> %a0, %a1
+  ret <16 x i8> %r
+}
+define <16 x i8> @srl_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: @srl_16i8
+; CHECK: vpxor
+; CHECK: vpsubb
+; CHECK: vpshlb
+  %r = lshr <16 x i8> %a0, %a1
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @sll_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @sll_8i16
+; CHECK: vpshlw
+  %r = shl <8 x i16> %a0, %a1
+  ret <8 x i16> %r
+}
+define <8 x i16> @sra_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @sra_8i16
+; CHECK: vpxor
+; CHECK: vpsubw
+; CHECK: vpshaw
+  %r = ashr <8 x i16> %a0, %a1
+  ret <8 x i16> %r
+}
+define <8 x i16> @srl_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @srl_8i16
+; CHECK: vpxor
+; CHECK: vpsubw
+; CHECK: vpshlw
+  %r = lshr <8 x i16> %a0, %a1
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sll_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; XOPAVX1-LABEL: @sll_4i32
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @sll_4i32
+; XOPAVX2: vpsllvd
+  %r = shl <4 x i32> %a0, %a1
+  ret <4 x i32> %r
+}
+define <4 x i32> @sra_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; XOPAVX1-LABEL: @sra_4i32
+; XOPAVX1: vpxor
+; XOPAVX1: vpsubd
+; XOPAVX1: vpshad
+; XOPAVX2-LABEL: @sra_4i32
+; XOPAVX2: vpsravd
+  %r = ashr <4 x i32> %a0, %a1
+  ret <4 x i32> %r
+}
+define <4 x i32> @srl_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; XOPAVX1-LABEL: @srl_4i32
+; XOPAVX1: vpxor
+; XOPAVX1: vpsubd
+; XOPAVX1: vpshld
+; XOPAVX2-LABEL: @srl_4i32
+; XOPAVX2: vpsrlvd
+  %r = lshr <4 x i32> %a0, %a1
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @sll_2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; XOPAVX1-LABEL: @sll_2i64
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @sll_2i64
+; XOPAVX2: vpsllvq
+  %r = shl <2 x i64> %a0, %a1
+  ret <2 x i64> %r
+}
+define <2 x i64> @sra_2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: @sra_2i64
+; CHECK: vpxor
+; CHECK: vpsubq
+; CHECK: vpshaq
+  %r = ashr <2 x i64> %a0, %a1
+  ret <2 x i64> %r
+}
+define <2 x i64> @srl_2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; XOPAVX1-LABEL: @srl_2i64
+; XOPAVX1: vpxor
+; XOPAVX1: vpsubq
+; XOPAVX1: vpshlq
+; XOPAVX2-LABEL: @srl_2i64
+; XOPAVX2: vpsrlvq
+  %r = lshr <2 x i64> %a0, %a1
+  ret <2 x i64> %r
+}