Skip to content

Commit 94a4cc0

Browse files
committed Jan 10, 2015
[X86][SSE] Improved (v)insertps shuffle matching
In the current code we only attempt to match against insertps if we have exactly one element from the second input vector, irrespective of how much of the shuffle result is zeroable. This patch checks to see if there is a single non-zeroable element from either input that requires insertion. It also supports matching of cases where only one of the inputs need to be referenced. We also split insertps shuffle matching off into a new lowerVectorShuffleAsInsertPS function. Differential Revision: http://reviews.llvm.org/D6879 llvm-svn: 225589
1 parent 9be98b6 commit 94a4cc0

File tree

4 files changed

+111
-61
lines changed

4 files changed

+111
-61
lines changed
 

Diff for: ‎llvm/lib/Target/X86/X86ISelLowering.cpp

+82-42
Original file line numberDiff line numberDiff line change
@@ -8163,6 +8163,84 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
81638163
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
81648164
}
81658165

8166+
// Check for whether we can use INSERTPS to perform the shuffle. We only use
8167+
// INSERTPS when the V1 elements are already in the correct locations
8168+
// because otherwise we can just always use two SHUFPS instructions which
8169+
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8170+
// perform INSERTPS if a single V1 element is out of place and all V2
8171+
// elements are zeroable.
8172+
static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8173+
ArrayRef<int> Mask,
8174+
SelectionDAG &DAG) {
8175+
assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8176+
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8177+
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8178+
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8179+
8180+
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8181+
8182+
unsigned ZMask = 0;
8183+
int V1DstIndex = -1;
8184+
int V2DstIndex = -1;
8185+
bool V1UsedInPlace = false;
8186+
8187+
for (int i = 0; i < 4; i++) {
8188+
// Synthesize a zero mask from the zeroable elements (includes undefs).
8189+
if (Zeroable[i]) {
8190+
ZMask |= 1 << i;
8191+
continue;
8192+
}
8193+
8194+
// Flag if we use any V1 inputs in place.
8195+
if (i == Mask[i]) {
8196+
V1UsedInPlace = true;
8197+
continue;
8198+
}
8199+
8200+
// We can only insert a single non-zeroable element.
8201+
if (V1DstIndex != -1 || V2DstIndex != -1)
8202+
return SDValue();
8203+
8204+
if (Mask[i] < 4) {
8205+
// V1 input out of place for insertion.
8206+
V1DstIndex = i;
8207+
} else {
8208+
// V2 input for insertion.
8209+
V2DstIndex = i;
8210+
}
8211+
}
8212+
8213+
// Don't bother if we have no (non-zeroable) element for insertion.
8214+
if (V1DstIndex == -1 && V2DstIndex == -1)
8215+
return SDValue();
8216+
8217+
// Determine element insertion src/dst indices. The src index is from the
8218+
// start of the inserted vector, not the start of the concatenated vector.
8219+
unsigned V2SrcIndex = 0;
8220+
if (V1DstIndex != -1) {
8221+
// If we have a V1 input out of place, we use V1 as the V2 element insertion
8222+
// and don't use the original V2 at all.
8223+
V2SrcIndex = Mask[V1DstIndex];
8224+
V2DstIndex = V1DstIndex;
8225+
V2 = V1;
8226+
} else {
8227+
V2SrcIndex = Mask[V2DstIndex] - 4;
8228+
}
8229+
8230+
// If no V1 inputs are used in place, then the result is created only from
8231+
// the zero mask and the V2 insertion - so remove V1 dependency.
8232+
if (!V1UsedInPlace)
8233+
V1 = DAG.getUNDEF(MVT::v4f32);
8234+
8235+
unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8236+
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8237+
8238+
// Insert the V2 element into the desired position.
8239+
SDLoc DL(Op);
8240+
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8241+
DAG.getConstant(InsertPSMask, MVT::i8));
8242+
}
8243+
81668244
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
81678245
///
81688246
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -8468,52 +8546,14 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
84688546
Mask, Subtarget, DAG))
84698547
return V;
84708548

8471-
if (Subtarget->hasSSE41())
8549+
if (Subtarget->hasSSE41()) {
84728550
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
84738551
Subtarget, DAG))
84748552
return Blend;
84758553

8476-
// Check for whether we can use INSERTPS to perform the blend. We only use
8477-
// INSERTPS when the V1 elements are already in the correct locations
8478-
// because otherwise we can just always use two SHUFPS instructions which
8479-
// are much smaller to encode than a SHUFPS and an INSERTPS.
8480-
if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
8481-
int V2Index =
8482-
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8483-
Mask.begin();
8484-
8485-
// When using INSERTPS we can zero any lane of the destination. Collect
8486-
// the zero inputs into a mask and drop them from the lanes of V1 which
8487-
// actually need to be present as inputs to the INSERTPS.
8488-
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8489-
8490-
// Synthesize a shuffle mask for the non-zero and non-v2 inputs.
8491-
bool InsertNeedsShuffle = false;
8492-
unsigned ZMask = 0;
8493-
for (int i = 0; i < 4; ++i)
8494-
if (i != V2Index) {
8495-
if (Zeroable[i]) {
8496-
ZMask |= 1 << i;
8497-
} else if (Mask[i] != i) {
8498-
InsertNeedsShuffle = true;
8499-
break;
8500-
}
8501-
}
8502-
8503-
// We don't want to use INSERTPS or other insertion techniques if it will
8504-
// require shuffling anyways.
8505-
if (!InsertNeedsShuffle) {
8506-
// If all of V1 is zeroable, replace it with undef.
8507-
if ((ZMask | 1 << V2Index) == 0xF)
8508-
V1 = DAG.getUNDEF(MVT::v4f32);
8509-
8510-
unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
8511-
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8512-
8513-
// Insert the V2 element into the desired position.
8514-
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8515-
DAG.getConstant(InsertPSMask, MVT::i8));
8516-
}
8554+
// Use INSERTPS if we can complete the shuffle efficiently.
8555+
if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8556+
return V;
85178557
}
85188558

85198559
// Otherwise fall back to a SHUFPS lowering strategy.

Diff for: ‎llvm/test/CodeGen/X86/combine-or.ll

+4-6
Original file line numberDiff line numberDiff line change
@@ -240,12 +240,10 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
240240
; CHECK-LABEL: test19:
241241
; CHECK: # BB#0:
242242
; CHECK-NEXT: xorps %xmm2, %xmm2
243-
; CHECK-NEXT: xorps %xmm3, %xmm3
244-
; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
245-
; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
246-
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
247-
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
248-
; CHECK-NEXT: orps %xmm3, %xmm2
243+
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,3]
244+
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
245+
; CHECK-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,2]
246+
; CHECK-NEXT: orps %xmm1, %xmm2
249247
; CHECK-NEXT: movaps %xmm2, %xmm0
250248
; CHECK-NEXT: retq
251249
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>

Diff for: ‎llvm/test/CodeGen/X86/masked_memop.ll

+5-5
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float
7171
; AVX2-LABEL: test5
7272
; AVX2: vmaskmovpd
7373
; AVX2: vblendvpd
74-
; AVX2: vmaskmovpd
74+
; AVX2: vmaskmovpd
7575
; AVX2: vblendvpd
7676
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
7777
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -150,7 +150,7 @@ define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val)
150150
}
151151

152152
; AVX2-LABEL: test14
153-
; AVX2: vshufps $-24
153+
; AVX2: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
154154
; AVX2: vmaskmovps
155155
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
156156
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -194,16 +194,16 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
194194
}
195195

196196

197-
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
197+
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
198198
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
199199
declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
200200
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
201201
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
202202
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
203203
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
204204
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
205-
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
206-
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
205+
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
206+
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
207207
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
208208
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
209209
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)

Diff for: ‎llvm/test/CodeGen/X86/vector-shuffle-combining.ll

+20-8
Original file line numberDiff line numberDiff line change
@@ -553,18 +553,30 @@ define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i3
553553
}
554554

555555
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
556-
; SSE-LABEL: combine_bitwise_ops_test3c:
557-
; SSE: # BB#0:
558-
; SSE-NEXT: xorps %xmm1, %xmm0
559-
; SSE-NEXT: xorps %xmm1, %xmm1
560-
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
561-
; SSE-NEXT: retq
556+
; SSE2-LABEL: combine_bitwise_ops_test3c:
557+
; SSE2: # BB#0:
558+
; SSE2-NEXT: xorps %xmm1, %xmm0
559+
; SSE2-NEXT: xorps %xmm1, %xmm1
560+
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
561+
; SSE2-NEXT: retq
562+
;
563+
; SSSE3-LABEL: combine_bitwise_ops_test3c:
564+
; SSSE3: # BB#0:
565+
; SSSE3-NEXT: xorps %xmm1, %xmm0
566+
; SSSE3-NEXT: xorps %xmm1, %xmm1
567+
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
568+
; SSSE3-NEXT: retq
569+
;
570+
; SSE41-LABEL: combine_bitwise_ops_test3c:
571+
; SSE41: # BB#0:
572+
; SSE41-NEXT: xorps %xmm1, %xmm0
573+
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
574+
; SSE41-NEXT: retq
562575
;
563576
; AVX-LABEL: combine_bitwise_ops_test3c:
564577
; AVX: # BB#0:
565578
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
566-
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
567-
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
579+
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
568580
; AVX-NEXT: retq
569581
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
570582
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>

0 commit comments

Comments
 (0)
Please sign in to comment.