Skip to content

Commit 3d11c99

Browse files
committed Sep 30, 2015
[X86][XOP] Added support for the lowering of 128-bit vector shifts to XOP shift instructions
The XOP shifts just have logical/arithmetic versions and the left/right shifts are controlled by whether the value is positive/negative. Because of this I've added new X86ISD nodes instead of trying to force them to use the existing shift nodes. Additionally, Excavator cores (bdver4) support XOP and AVX2 — meaning that it should use the AVX2 shifts when it can and fall back to XOP in other cases.

Differential Revision: http://reviews.llvm.org/D8690

llvm-svn: 248878
1 parent 82d705e commit 3d11c99

13 files changed

+1391
-46
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17893,18 +17893,28 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
1789317893

1789417894
// i64 SRA needs to be performed as partial shifts.
1789517895
if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
17896-
Op.getOpcode() == ISD::SRA)
17896+
Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
1789717897
return ArithmeticShiftRight64(ShiftAmt);
1789817898

1789917899
if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
1790017900
unsigned NumElts = VT.getVectorNumElements();
1790117901
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
1790217902

17903-
if (Op.getOpcode() == ISD::SHL) {
17904-
// Simple i8 add case
17905-
if (ShiftAmt == 1)
17906-
return DAG.getNode(ISD::ADD, dl, VT, R, R);
17903+
// Simple i8 add case
17904+
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
17905+
return DAG.getNode(ISD::ADD, dl, VT, R, R);
17906+
17907+
// ashr(R, 7) === cmp_slt(R, 0)
17908+
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
17909+
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
17910+
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
17911+
}
1790717912

17913+
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
17914+
if (VT == MVT::v16i8 && Subtarget->hasXOP())
17915+
return SDValue();
17916+
17917+
if (Op.getOpcode() == ISD::SHL) {
1790817918
// Make a large shift.
1790917919
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
1791017920
R, ShiftAmt, DAG);
@@ -17927,12 +17937,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
1792717937
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
1792817938
}
1792917939
if (Op.getOpcode() == ISD::SRA) {
17930-
if (ShiftAmt == 7) {
17931-
// ashr(R, 7) === cmp_slt(R, 0)
17932-
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
17933-
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
17934-
}
17935-
1793617940
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
1793717941
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
1793817942
SmallVector<SDValue, 32> V(NumElts,
@@ -17949,7 +17953,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
1794917953
}
1795017954

1795117955
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
17952-
if (!Subtarget->is64Bit() &&
17956+
if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
1795317957
(VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
1795417958

1795517959
// Peek through any splat that was introduced for i64 shift vectorization.
@@ -18103,11 +18107,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
1810318107
return V;
1810418108

1810518109
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
18106-
return V;
18110+
return V;
1810718111

1810818112
if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
1810918113
return Op;
1811018114

18115+
// XOP has 128-bit variable logical/arithmetic shifts.
18116+
// +ve/-ve Amt = shift left/right.
18117+
if (Subtarget->hasXOP() &&
18118+
(VT == MVT::v2i64 || VT == MVT::v4i32 ||
18119+
VT == MVT::v8i16 || VT == MVT::v16i8)) {
18120+
if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
18121+
SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
18122+
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
18123+
}
18124+
if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
18125+
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
18126+
if (Op.getOpcode() == ISD::SRA)
18127+
return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
18128+
}
18129+
1811118130
// 2i64 vector logical shifts can efficiently avoid scalarization - do the
1811218131
// shifts per-lane and then shuffle the partial results back together.
1811318132
if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
@@ -18296,7 +18315,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
1829618315
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
1829718316
}
1829818317

18299-
if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) {
18318+
if (VT == MVT::v16i8 ||
18319+
(VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
1830018320
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
1830118321
unsigned ShiftOpcode = Op->getOpcode();
1830218322

@@ -18416,7 +18436,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
1841618436
DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
1841718437
}
1841818438

18419-
if (Subtarget->hasInt256() && VT == MVT::v16i16) {
18439+
if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
1842018440
MVT ExtVT = MVT::v8i32;
1842118441
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
1842218442
SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
@@ -19820,6 +19840,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
1982019840
case X86ISD::RDSEED: return "X86ISD::RDSEED";
1982119841
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
1982219842
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
19843+
case X86ISD::VPSHA: return "X86ISD::VPSHA";
19844+
case X86ISD::VPSHL: return "X86ISD::VPSHL";
1982319845
case X86ISD::FMADD: return "X86ISD::FMADD";
1982419846
case X86ISD::FMSUB: return "X86ISD::FMSUB";
1982519847
case X86ISD::FNMADD: return "X86ISD::FNMADD";

‎llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,9 @@ namespace llvm {
410410
/// SSE4A Extraction and Insertion.
411411
EXTRQI, INSERTQI,
412412

413+
// XOP arithmetic/logical shifts
414+
VPSHA, VPSHL,
415+
413416
// Vector multiply packed unsigned doubleword integers
414417
PMULUDQ,
415418
// Vector multiply packed signed doubleword integers

‎llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,13 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
215215
def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
216216
def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
217217

218+
def X86vpshl : SDNode<"X86ISD::VPSHL",
219+
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
220+
SDTCisVec<2>]>>;
221+
def X86vpsha : SDNode<"X86ISD::VPSHA",
222+
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
223+
SDTCisVec<2>]>>;
224+
218225
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
219226
SDTCisVec<1>,
220227
SDTCisSameAs<2, 1>]>;

‎llvm/lib/Target/X86/X86InstrXOP.td

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,42 @@ let ExeDomain = SSEPackedDouble in {
8383
defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
8484
}
8585

86-
multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
86+
multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
87+
ValueType vt128> {
88+
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
89+
(ins VR128:$src1, VR128:$src2),
90+
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
91+
[(set VR128:$dst,
92+
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
93+
XOP_4VOp3, Sched<[WriteVarVecShift]>;
94+
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
95+
(ins VR128:$src1, i128mem:$src2),
96+
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
97+
[(set VR128:$dst,
98+
(vt128 (OpNode (vt128 VR128:$src1),
99+
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
100+
XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
101+
def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
102+
(ins i128mem:$src1, VR128:$src2),
103+
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
104+
[(set VR128:$dst,
105+
(vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
106+
(vt128 VR128:$src2))))]>,
107+
XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>;
108+
}
109+
110+
let ExeDomain = SSEPackedInt in {
111+
defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
112+
defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
113+
defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
114+
defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
115+
defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
116+
defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
117+
defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
118+
defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
119+
}
120+
121+
multiclass xop3op_int<bits<8> opc, string OpcodeStr, Intrinsic Int> {
87122
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
88123
(ins VR128:$src1, VR128:$src2),
89124
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -103,18 +138,10 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
103138
}
104139

105140
let ExeDomain = SSEPackedInt in {
106-
defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
107-
defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
108-
defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
109-
defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
110-
defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
111-
defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
112-
defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
113-
defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
114-
defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
115-
defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
116-
defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
117-
defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
141+
defm VPROTW : xop3op_int<0x91, "vprotw", int_x86_xop_vprotw>;
142+
defm VPROTQ : xop3op_int<0x93, "vprotq", int_x86_xop_vprotq>;
143+
defm VPROTD : xop3op_int<0x92, "vprotd", int_x86_xop_vprotd>;
144+
defm VPROTB : xop3op_int<0x90, "vprotb", int_x86_xop_vprotb>;
118145
}
119146

120147
multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {

‎llvm/lib/Target/X86/X86IntrinsicsInfo.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1661,7 +1661,15 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
16611661
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
16621662
X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
16631663
X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
1664-
X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0)
1664+
X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
1665+
X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
1666+
X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
1667+
X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
1668+
X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
1669+
X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
1670+
X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
1671+
X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
1672+
X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
16651673
};
16661674

16671675
/*

‎llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,12 @@ int X86TTIImpl::getArithmeticInstrCost(
140140
{ ISD::SRA, MVT::v8i64, 1 },
141141
};
142142

143+
if (ST->hasAVX512()) {
144+
int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
145+
if (Idx != -1)
146+
return LT.first * AVX512CostTable[Idx].Cost;
147+
}
148+
143149
static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
144150
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
145151
// customize them to detect the cases where shift amount is a scalar one.
@@ -153,7 +159,59 @@ int X86TTIImpl::getArithmeticInstrCost(
153159
{ ISD::SRL, MVT::v2i64, 1 },
154160
{ ISD::SHL, MVT::v4i64, 1 },
155161
{ ISD::SRL, MVT::v4i64, 1 },
162+
};
163+
164+
// Look for AVX2 lowering tricks.
165+
if (ST->hasAVX2()) {
166+
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
167+
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
168+
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
169+
// On AVX2, a packed v16i16 shift left by a constant build_vector
170+
// is lowered into a vector multiply (vpmullw).
171+
return LT.first;
156172

173+
int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
174+
if (Idx != -1)
175+
return LT.first * AVX2CostTable[Idx].Cost;
176+
}
177+
178+
static const CostTblEntry<MVT::SimpleValueType> XOPCostTable[] = {
179+
// 128bit shifts take 1cy, but right shifts require negation beforehand.
180+
{ ISD::SHL, MVT::v16i8, 1 },
181+
{ ISD::SRL, MVT::v16i8, 2 },
182+
{ ISD::SRA, MVT::v16i8, 2 },
183+
{ ISD::SHL, MVT::v8i16, 1 },
184+
{ ISD::SRL, MVT::v8i16, 2 },
185+
{ ISD::SRA, MVT::v8i16, 2 },
186+
{ ISD::SHL, MVT::v4i32, 1 },
187+
{ ISD::SRL, MVT::v4i32, 2 },
188+
{ ISD::SRA, MVT::v4i32, 2 },
189+
{ ISD::SHL, MVT::v2i64, 1 },
190+
{ ISD::SRL, MVT::v2i64, 2 },
191+
{ ISD::SRA, MVT::v2i64, 2 },
192+
// 256bit shifts require splitting if AVX2 didn't catch them above.
193+
{ ISD::SHL, MVT::v32i8, 2 },
194+
{ ISD::SRL, MVT::v32i8, 4 },
195+
{ ISD::SRA, MVT::v32i8, 4 },
196+
{ ISD::SHL, MVT::v16i16, 2 },
197+
{ ISD::SRL, MVT::v16i16, 4 },
198+
{ ISD::SRA, MVT::v16i16, 4 },
199+
{ ISD::SHL, MVT::v8i32, 2 },
200+
{ ISD::SRL, MVT::v8i32, 4 },
201+
{ ISD::SRA, MVT::v8i32, 4 },
202+
{ ISD::SHL, MVT::v4i64, 2 },
203+
{ ISD::SRL, MVT::v4i64, 4 },
204+
{ ISD::SRA, MVT::v4i64, 4 },
205+
};
206+
207+
// Look for XOP lowering tricks.
208+
if (ST->hasXOP()) {
209+
int Idx = CostTableLookup(XOPCostTable, ISD, LT.second);
210+
if (Idx != -1)
211+
return LT.first * XOPCostTable[Idx].Cost;
212+
}
213+
214+
static const CostTblEntry<MVT::SimpleValueType> AVX2CustomCostTable[] = {
157215
{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
158216
{ ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
159217

@@ -176,23 +234,11 @@ int X86TTIImpl::getArithmeticInstrCost(
176234
{ ISD::UDIV, MVT::v4i64, 4*20 },
177235
};
178236

179-
if (ST->hasAVX512()) {
180-
int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
181-
if (Idx != -1)
182-
return LT.first * AVX512CostTable[Idx].Cost;
183-
}
184-
// Look for AVX2 lowering tricks.
237+
// Look for AVX2 lowering tricks for custom cases.
185238
if (ST->hasAVX2()) {
186-
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
187-
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
188-
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
189-
// On AVX2, a packed v16i16 shift left by a constant build_vector
190-
// is lowered into a vector multiply (vpmullw).
191-
return LT.first;
192-
193-
int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
239+
int Idx = CostTableLookup(AVX2CustomCostTable, ISD, LT.second);
194240
if (Idx != -1)
195-
return LT.first * AVX2CostTable[Idx].Cost;
241+
return LT.first * AVX2CustomCostTable[Idx].Cost;
196242
}
197243

198244
static const CostTblEntry<MVT::SimpleValueType>

0 commit comments

Comments (0)