
Commit c5bb362

committed Oct 24, 2018
[X86][SSE] Add SimplifyDemandedBitsForTargetNode PMULDQ/PMULUDQ handling
Add X86 SimplifyDemandedBitsForTargetNode and use it to simplify PMULDQ/PMULUDQ target nodes. This enables us to repeatedly simplify the node's arguments after the previous approach had to be reverted due to PR39398.

Differential Revision: https://reviews.llvm.org/D53643

llvm-svn: 345182
1 parent 0657095 · commit c5bb362
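For context, the fold rests on one fact about these instructions: PMULDQ/PMULUDQ read only the low 32 bits of each 64-bit vector element, so demands on the operands' upper halves can be dropped. A minimal scalar model of that property (plain C++, not LLVM code; the helper name is made up for illustration):

#include <cassert>
#include <cstdint>

// Hypothetical helper modelling one 64-bit lane of PMULUDQ: the hardware
// multiplies only the low 32 bits of each operand into a 64-bit result.
static uint64_t pmuludqLane(uint64_t LHS, uint64_t RHS) {
  return (LHS & 0xFFFFFFFFull) * (RHS & 0xFFFFFFFFull);
}

int main() {
  // Garbage in the upper 32 bits of either operand cannot change the lane,
  // which is why a demanded-bits walk may freely rewrite those bits.
  assert(pmuludqLane(0xDEADBEEF00000005ull, 0x0000000000000007ull) ==
         pmuludqLane(0x0000000000000005ull, 0xABCDEF0100000007ull));
  return 0;
}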

4 files changed: +70 −66 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

+26 −6
@@ -31870,6 +31870,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   return false;
 }
 
+bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  unsigned Opc = Op.getOpcode();
+  switch(Opc) {
+  case X86ISD::PMULDQ:
+  case X86ISD::PMULUDQ: {
+    // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
+    KnownBits KnownOp;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+    APInt DemandedMask = OriginalDemandedBits & APInt::getLowBitsSet(64, 32);
+    if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    break;
+  }
+  }
+
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, Known, TLO, Depth);
+}
+
 /// Check if a vector extract from a target-specific shuffle of a load can be
 /// folded into a single element load.
 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
@@ -40362,13 +40386,9 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
     return RHS;
 
+  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  APInt DemandedMask(APInt::getLowBitsSet(64, 32));
-
-  // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
-  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, DCI))
-    return SDValue(N, 0);
-  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, DCI))
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
     return SDValue(N, 0);
 
   return SDValue();
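The two hunks cooperate: combinePMULDQ now demands all 64 bits of the multiply node itself, and the new target hook intersects whatever is demanded with a low-32-bit mask before recursing into each operand. A quick standalone check of that mask arithmetic (a sketch compiled against LLVM's ADT headers; main() and the asserts are illustrative only):

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  // The demand placed on the PMULDQ/PMULUDQ result by the combine.
  APInt OriginalDemandedBits = APInt::getAllOnesValue(64);
  // The per-operand demand built inside SimplifyDemandedBitsForTargetNode.
  APInt DemandedMask = OriginalDemandedBits & APInt::getLowBitsSet(64, 32);
  assert(DemandedMask == APInt(64, 0xFFFFFFFFull));
  // A demand that touches only the upper halves vanishes entirely, so those
  // operand bits can be simplified to anything convenient.
  assert((APInt::getHighBitsSet(64, 32) & DemandedMask) == 0);
  return 0;
}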

llvm/lib/Target/X86/X86ISelLowering.h

+6
@@ -874,6 +874,12 @@ namespace llvm {
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth) const override;
 
+    bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                           const APInt &DemandedBits,
+                                           KnownBits &Known,
+                                           TargetLoweringOpt &TLO,
+                                           unsigned Depth) const override;
+
     SDValue unwrapAddress(SDValue N) const override;
 
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,

llvm/test/CodeGen/X86/combine-pmuldq.ll

+4 −26
@@ -47,26 +47,10 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-NEXT: pmuludq %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: combine_shuffle_zero_pmuludq:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: retq
+; AVX-LABEL: combine_shuffle_zero_pmuludq:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %3 = bitcast <4 x i32> %1 to <2 x i64>
@@ -84,22 +68,16 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
 ;
 ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: retq
   %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>

llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll

+34 −34
@@ -143,31 +143,31 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone {
 define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_even_div:
 ; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,3435973837,2863311531,2454267027]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT: psrld $1, %xmm3
 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: psrld $3, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: psrld $3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,10,12,14]
 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -377,30 +377,30 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone {
377377
define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone {
378378
; CHECK-SSE2-LABEL: test_urem_one:
379379
; CHECK-SSE2: # %bb.0:
380-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
381-
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,0,2863311531,2454267027]
382-
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
383-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
380+
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027]
381+
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
382+
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
383+
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
384384
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
385385
; CHECK-SSE2-NEXT: psrld $1, %xmm3
386386
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
387387
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
388-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
389-
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
390-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
391-
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
392-
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
393-
; CHECK-SSE2-NEXT: psrld $2, %xmm2
388+
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
389+
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
390+
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
391+
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
392+
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
393+
; CHECK-SSE2-NEXT: psrld $2, %xmm1
394394
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
395-
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
396-
; CHECK-SSE2-NEXT: psrld $3, %xmm1
397-
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
398-
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,1,12,14]
399-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
395+
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
396+
; CHECK-SSE2-NEXT: psrld $3, %xmm2
397+
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
398+
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [6,1,12,14]
399+
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
400400
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
401401
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
402-
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
403-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
402+
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
403+
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
404404
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
405405
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
406406
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
