Skip to content

Commit 1358d86

Browse files
committed Feb 1, 2016
[X86][SSE] Find source of the inserted element of INSERTPS
Minor patch to trace back through target shuffles to the source of the inserted element in a (V)INSERTPS shuffle. Differential Revision: http://reviews.llvm.org/D16652 llvm-svn: 259343
1 parent 6cc9115 commit 1358d86

File tree

2 files changed

+33
-19
lines changed

2 files changed

+33
-19
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4317,6 +4317,11 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
43174317
return (Val < 0 || Val == CmpVal);
43184318
}
43194319

4320+
/// Return true if Val is either the undef sentinel (SM_SentinelUndef) or the
/// zero sentinel (SM_SentinelZero).
4321+
static bool isUndefOrZero(int Val) {
4322+
return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
4323+
}
4324+
43204325
/// Return true if every element in Mask, beginning
43214326
/// from position Pos and ending in Pos+Size, falls within the specified
43224327
/// sequential range (Low, Low+Size], or is undef.
@@ -23989,6 +23994,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
2398923994
SDValue Op1 = N.getOperand(1);
2399023995
SDValue Op2 = N.getOperand(2);
2399123996
unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
23997+
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
2399223998
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
2399323999
unsigned ZeroMask = InsertPSMask & 0xF;
2399424000

@@ -24002,19 +24008,38 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
2400224008
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
2400324009
DAG.getConstant(InsertPSMask, DL, MVT::i8));
2400424010

24005-
SmallVector<int, 8> TargetMask;
24006-
if (!setTargetShuffleZeroElements(Op0, TargetMask))
24011+
// Attempt to merge insertps Op1 with an inner target shuffle node.
24012+
SmallVector<int, 8> TargetMask1;
24013+
if (setTargetShuffleZeroElements(Op1, TargetMask1)) {
24014+
int M = TargetMask1[SrcIdx];
24015+
if (isUndefOrZero(M)) {
24016+
// Zero/UNDEF insertion - zero out element and remove dependency.
24017+
InsertPSMask |= (1u << DstIdx);
24018+
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
24019+
DAG.getConstant(InsertPSMask, DL, MVT::i8));
24020+
}
24021+
// Update insertps mask srcidx and reference the source input directly.
24022+
assert(0 <= M && M < 8 && "Shuffle index out of range");
24023+
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
24024+
Op1 = Op1.getOperand(M < 4 ? 0 : 1);
24025+
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
24026+
DAG.getConstant(InsertPSMask, DL, MVT::i8));
24027+
}
24028+
24029+
// Attempt to merge insertps Op0 with an inner target shuffle node.
24030+
SmallVector<int, 8> TargetMask0;
24031+
if (!setTargetShuffleZeroElements(Op0, TargetMask0))
2400724032
return SDValue();
2400824033

2400924034
bool Updated = false;
2401024035
bool UseInput00 = false;
2401124036
bool UseInput01 = false;
2401224037
for (int i = 0; i != 4; ++i) {
24013-
int M = TargetMask[i];
24038+
int M = TargetMask0[i];
2401424039
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
2401524040
// No change if element is already zero or the inserted element.
2401624041
continue;
24017-
} else if (M < 0) {
24042+
} else if (isUndefOrZero(M)) {
2401824043
// If the target mask is undef/zero then we must zero the element.
2401924044
InsertPSMask |= (1u << i);
2402024045
Updated = true;

‎llvm/test/CodeGen/X86/sse41.ll

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -846,16 +846,12 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
846846
; X32: ## BB#0:
847847
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
848848
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
849-
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
850-
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
851-
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
849+
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
852850
; X32-NEXT: retl
853851
;
854852
; X64-LABEL: insertps_from_broadcast_loadf32:
855853
; X64: ## BB#0:
856-
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
857-
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
858-
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
854+
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
859855
; X64-NEXT: retq
860856
%1 = getelementptr inbounds float, float* %fb, i64 %index
861857
%2 = load float, float* %1, align 4
@@ -871,16 +867,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
871867
; X32-LABEL: insertps_from_broadcast_loadv4f32:
872868
; X32: ## BB#0:
873869
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
874-
; X32-NEXT: movups (%eax), %xmm1
875-
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
876-
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
870+
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
877871
; X32-NEXT: retl
878872
;
879873
; X64-LABEL: insertps_from_broadcast_loadv4f32:
880874
; X64: ## BB#0:
881-
; X64-NEXT: movups (%rdi), %xmm1
882-
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
883-
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
875+
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
884876
; X64-NEXT: retq
885877
%1 = load <4 x float>, <4 x float>* %b, align 4
886878
%2 = extractelement <4 x float> %1, i32 0
@@ -892,14 +884,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
892884
ret <4 x float> %7
893885
}
894886

895-
;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
896887
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
897888
; X32-LABEL: insertps_from_broadcast_multiple_use:
898889
; X32: ## BB#0:
899890
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
900891
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
901892
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
902-
; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
903893
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
904894
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
905895
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
@@ -912,7 +902,6 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
912902
; X64-LABEL: insertps_from_broadcast_multiple_use:
913903
; X64: ## BB#0:
914904
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
915-
; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
916905
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
917906
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
918907
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]

0 commit comments

Comments
 (0)