Skip to content

Commit 8e9a843

Browse files
committed Jan 14, 2019
[CodeGen][X86] Expand USUBSAT to UMAX+SUB, also for vectors
Related to https://bugs.llvm.org/show_bug.cgi?id=40123.

Rather than scalarizing, expand a vector USUBSAT into UMAX+SUB, which produces much better code for X86.

Differential Revision: https://reviews.llvm.org/D56636
llvm-svn: 351125
1 parent 544fa42 commit 8e9a843

File tree

6 files changed

+778
-1651
lines changed

6 files changed

+778
-1651
lines changed
 

‎llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,7 @@ class VectorLegalizer {
141141
SDValue ExpandFunnelShift(SDValue Op);
142142
SDValue ExpandROT(SDValue Op);
143143
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
144+
SDValue ExpandAddSubSat(SDValue Op);
144145
SDValue ExpandStrictFPOp(SDValue Op);
145146

146147
/// Implements vector promotion.
@@ -777,6 +778,11 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
777778
case ISD::FMINNUM:
778779
case ISD::FMAXNUM:
779780
return ExpandFMINNUM_FMAXNUM(Op);
781+
case ISD::USUBSAT:
782+
case ISD::SSUBSAT:
783+
case ISD::UADDSAT:
784+
case ISD::SADDSAT:
785+
return ExpandAddSubSat(Op);
780786
case ISD::STRICT_FADD:
781787
case ISD::STRICT_FSUB:
782788
case ISD::STRICT_FMUL:
@@ -1206,6 +1212,12 @@ SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
12061212
return DAG.UnrollVectorOp(Op.getNode());
12071213
}
12081214

1215+
/// Expands a vector saturating add/sub node (reached from Expand() for
/// USUBSAT, SSUBSAT, UADDSAT and SADDSAT). The target-lowering helper
/// TLI.expandAddSubSat is tried first; if it yields an empty SDValue, the
/// node is handed to DAG.UnrollVectorOp instead.
SDValue VectorLegalizer::ExpandAddSubSat(SDValue Op) {
1216+
// Prefer the TargetLowering expansion whenever it produces a value.
if (SDValue Expanded = TLI.expandAddSubSat(Op.getNode(), DAG))
1217+
return Expanded;
1218+
// No expansion was produced: fall back to DAG.UnrollVectorOp.
return DAG.UnrollVectorOp(Op.getNode());
1219+
}
1220+
12091221
SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
12101222
EVT VT = Op.getValueType();
12111223
EVT EltVT = VT.getVectorElementType();

‎llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5277,6 +5277,22 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
52775277

52785278
SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
52795279
unsigned Opcode = Node->getOpcode();
5280+
SDValue LHS = Node->getOperand(0);
5281+
SDValue RHS = Node->getOperand(1);
5282+
EVT VT = LHS.getValueType();
5283+
SDLoc dl(Node);
5284+
5285+
// usub.sat(a, b) -> umax(a, b) - b
5286+
if (Opcode == ISD::USUBSAT && isOperationLegalOrCustom(ISD::UMAX, VT)) {
5287+
SDValue Max = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
5288+
return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
5289+
}
5290+
5291+
if (VT.isVector()) {
5292+
// TODO: Consider not scalarizing here.
5293+
return SDValue();
5294+
}
5295+
52805296
unsigned OverflowOp;
52815297
switch (Opcode) {
52825298
case ISD::SADDSAT:
@@ -5295,11 +5311,7 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
52955311
llvm_unreachable("Expected method to receive signed or unsigned saturation "
52965312
"addition or subtraction node.");
52975313
}
5298-
assert(Node->getNumOperands() == 2 && "Expected node to have 2 operands.");
52995314

5300-
SDLoc dl(Node);
5301-
SDValue LHS = Node->getOperand(0);
5302-
SDValue RHS = Node->getOperand(1);
53035315
assert(LHS.getValueType().isScalarInteger() &&
53045316
"Expected operands to be integers. Vector of int arguments should "
53055317
"already be unrolled.");

‎llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1780,6 +1780,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
17801780
{ ISD::CTPOP, MVT::v16i32, 24 },
17811781
{ ISD::CTTZ, MVT::v8i64, 20 },
17821782
{ ISD::CTTZ, MVT::v16i32, 28 },
1783+
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
1784+
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
1785+
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
1786+
{ ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
17831787
};
17841788
static const CostTblEntry XOPCostTbl[] = {
17851789
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -1823,6 +1827,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
18231827
{ ISD::UADDSAT, MVT::v32i8, 1 },
18241828
{ ISD::USUBSAT, MVT::v16i16, 1 },
18251829
{ ISD::USUBSAT, MVT::v32i8, 1 },
1830+
{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
18261831
{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
18271832
{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
18281833
{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -1858,6 +1863,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
18581863
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
18591864
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
18601865
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1866+
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
18611867
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
18621868
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
18631869
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -1878,6 +1884,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
18781884
{ ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
18791885
};
18801886
static const CostTblEntry SSE42CostTbl[] = {
1887+
{ ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
18811888
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
18821889
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
18831890
};

‎llvm/test/Analysis/CostModel/X86/arith-usat.ll

Lines changed: 70 additions & 51 deletions
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/usub_sat.ll

Lines changed: 9 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -112,37 +112,15 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
112112
;
113113
; X64-LABEL: vec:
114114
; X64: # %bb.0:
115-
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
116-
; X64-NEXT: movd %xmm2, %eax
117-
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
118-
; X64-NEXT: movd %xmm2, %ecx
119-
; X64-NEXT: xorl %edx, %edx
120-
; X64-NEXT: subl %eax, %ecx
121-
; X64-NEXT: cmovbl %edx, %ecx
122-
; X64-NEXT: movd %ecx, %xmm2
123-
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
124-
; X64-NEXT: movd %xmm3, %eax
125-
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
126-
; X64-NEXT: movd %xmm3, %ecx
127-
; X64-NEXT: subl %eax, %ecx
128-
; X64-NEXT: cmovbl %edx, %ecx
129-
; X64-NEXT: movd %ecx, %xmm3
130-
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
131-
; X64-NEXT: movd %xmm1, %eax
132-
; X64-NEXT: movd %xmm0, %ecx
133-
; X64-NEXT: subl %eax, %ecx
134-
; X64-NEXT: cmovbl %edx, %ecx
135-
; X64-NEXT: movd %ecx, %xmm2
136-
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
137-
; X64-NEXT: movd %xmm1, %eax
138-
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
139-
; X64-NEXT: movd %xmm0, %ecx
140-
; X64-NEXT: subl %eax, %ecx
141-
; X64-NEXT: cmovbl %edx, %ecx
142-
; X64-NEXT: movd %ecx, %xmm0
143-
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
144-
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
145-
; X64-NEXT: movdqa %xmm2, %xmm0
115+
; X64-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
116+
; X64-NEXT: movdqa %xmm1, %xmm3
117+
; X64-NEXT: pxor %xmm2, %xmm3
118+
; X64-NEXT: pxor %xmm0, %xmm2
119+
; X64-NEXT: pcmpgtd %xmm3, %xmm2
120+
; X64-NEXT: pand %xmm2, %xmm0
121+
; X64-NEXT: pandn %xmm1, %xmm2
122+
; X64-NEXT: por %xmm2, %xmm0
123+
; X64-NEXT: psubd %xmm1, %xmm0
146124
; X64-NEXT: retq
147125
%tmp = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
148126
ret <4 x i32> %tmp;

‎llvm/test/CodeGen/X86/usub_sat_vec.ll

Lines changed: 664 additions & 1565 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.