@@ -8163,6 +8163,84 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8163
8163
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8164
8164
}
8165
8165
8166
+ // Check for whether we can use INSERTPS to perform the shuffle. We only use
8167
+ // INSERTPS when the V1 elements are already in the correct locations
8168
+ // because otherwise we can just always use two SHUFPS instructions which
8169
+ // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8170
+ // perform INSERTPS if a single V1 element is out of place and all V2
8171
+ // elements are zeroable.
8172
+ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8173
+ ArrayRef<int> Mask,
8174
+ SelectionDAG &DAG) {
8175
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8176
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8177
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8178
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8179
+
8180
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8181
+
8182
+ unsigned ZMask = 0;
8183
+ int V1DstIndex = -1;
8184
+ int V2DstIndex = -1;
8185
+ bool V1UsedInPlace = false;
8186
+
8187
+ for (int i = 0; i < 4; i++) {
8188
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
8189
+ if (Zeroable[i]) {
8190
+ ZMask |= 1 << i;
8191
+ continue;
8192
+ }
8193
+
8194
+ // Flag if we use any V1 inputs in place.
8195
+ if (i == Mask[i]) {
8196
+ V1UsedInPlace = true;
8197
+ continue;
8198
+ }
8199
+
8200
+ // We can only insert a single non-zeroable element.
8201
+ if (V1DstIndex != -1 || V2DstIndex != -1)
8202
+ return SDValue();
8203
+
8204
+ if (Mask[i] < 4) {
8205
+ // V1 input out of place for insertion.
8206
+ V1DstIndex = i;
8207
+ } else {
8208
+ // V2 input for insertion.
8209
+ V2DstIndex = i;
8210
+ }
8211
+ }
8212
+
8213
+ // Don't bother if we have no (non-zeroable) element for insertion.
8214
+ if (V1DstIndex == -1 && V2DstIndex == -1)
8215
+ return SDValue();
8216
+
8217
+ // Determine element insertion src/dst indices. The src index is from the
8218
+ // start of the inserted vector, not the start of the concatenated vector.
8219
+ unsigned V2SrcIndex = 0;
8220
+ if (V1DstIndex != -1) {
8221
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
8222
+ // and don't use the original V2 at all.
8223
+ V2SrcIndex = Mask[V1DstIndex];
8224
+ V2DstIndex = V1DstIndex;
8225
+ V2 = V1;
8226
+ } else {
8227
+ V2SrcIndex = Mask[V2DstIndex] - 4;
8228
+ }
8229
+
8230
+ // If no V1 inputs are used in place, then the result is created only from
8231
+ // the zero mask and the V2 insertion - so remove V1 dependency.
8232
+ if (!V1UsedInPlace)
8233
+ V1 = DAG.getUNDEF(MVT::v4f32);
8234
+
8235
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8236
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8237
+
8238
+ // Insert the V2 element into the desired position.
8239
+ SDLoc DL(Op);
8240
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8241
+ DAG.getConstant(InsertPSMask, MVT::i8));
8242
+ }
8243
+
8166
8244
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8167
8245
///
8168
8246
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -8468,52 +8546,14 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8468
8546
Mask, Subtarget, DAG))
8469
8547
return V;
8470
8548
8471
- if (Subtarget->hasSSE41())
8549
+ if (Subtarget->hasSSE41()) {
8472
8550
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8473
8551
Subtarget, DAG))
8474
8552
return Blend;
8475
8553
8476
- // Check for whether we can use INSERTPS to perform the blend. We only use
8477
- // INSERTPS when the V1 elements are already in the correct locations
8478
- // because otherwise we can just always use two SHUFPS instructions which
8479
- // are much smaller to encode than a SHUFPS and an INSERTPS.
8480
- if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
8481
- int V2Index =
8482
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8483
- Mask.begin();
8484
-
8485
- // When using INSERTPS we can zero any lane of the destination. Collect
8486
- // the zero inputs into a mask and drop them from the lanes of V1 which
8487
- // actually need to be present as inputs to the INSERTPS.
8488
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8489
-
8490
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
8491
- bool InsertNeedsShuffle = false;
8492
- unsigned ZMask = 0;
8493
- for (int i = 0; i < 4; ++i)
8494
- if (i != V2Index) {
8495
- if (Zeroable[i]) {
8496
- ZMask |= 1 << i;
8497
- } else if (Mask[i] != i) {
8498
- InsertNeedsShuffle = true;
8499
- break;
8500
- }
8501
- }
8502
-
8503
- // We don't want to use INSERTPS or other insertion techniques if it will
8504
- // require shuffling anyways.
8505
- if (!InsertNeedsShuffle) {
8506
- // If all of V1 is zeroable, replace it with undef.
8507
- if ((ZMask | 1 << V2Index) == 0xF)
8508
- V1 = DAG.getUNDEF(MVT::v4f32);
8509
-
8510
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
8511
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8512
-
8513
- // Insert the V2 element into the desired position.
8514
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8515
- DAG.getConstant(InsertPSMask, MVT::i8));
8516
- }
8554
+ // Use INSERTPS if we can complete the shuffle efficiently.
8555
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8556
+ return V;
8517
8557
}
8518
8558
8519
8559
// Otherwise fall back to a SHUFPS lowering strategy.
0 commit comments