lib/Target/X86/X86ISelLowering.cpp
[10,741 lines skipped]
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}

// Attempt to lower a shuffle where one lane comes from V1 and the other
// lane comes from V2 and both lanes use the same shuffle operation. We can
// create a new V1 from the lower lane of V1 and the lower lane of V2, and a
// new V2 from the upper lane of V1 and the upper lane of V2, and then do a
// repeated lane shuffle.
static SDValue lowerVectorShuffleSplitLowHigh(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              SelectionDAG &DAG) {
  int Size = Mask.size();
  SmallVector<int, 8> RepeatMask(Size, -1);
RKSimon: Shouldn't RepeatMask be just LaneSize wide?
craig.topper: Yes it should.
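For reference, a minimal sketch of the fix agreed on here (an illustration, not part of the patch): compute LaneSize before the mask is allocated and size RepeatMask to the lane width rather than the full mask width. The rest of the function already indexes it with i % LaneSize, so only the allocation would change.
```
  // Sketch of the suggested change: one entry per in-lane element suffices.
  int LaneSize = 128 / VT.getScalarSizeInBits();
  SmallVector<int, 8> RepeatMask(LaneSize, -1);
```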
  int LaneSize = 128 / VT.getScalarSizeInBits();
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // Make sure the element comes from the same source as the half.
    if ((M / Size) != (i / LaneSize))
      return SDValue();
    // Require every lane to use the same source-relative element pattern.
    int LocalM = M % Size;
    if (RepeatMask[i % LaneSize] < 0)
      RepeatMask[i % LaneSize] = LocalM;
    else if (RepeatMask[i % LaneSize] != LocalM)
      return SDValue();
  }

  // Build NewV1 from the lower lane of V1 and the lower lane of V2.
  SmallVector<int, 8> PermuteMask(Size, -1);
  for (int i = 0; i != Size; ++i)
    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size;
  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);

  // Build NewV2 from the upper lane of V1 and the upper lane of V2.
  for (int i = 0; i != Size; ++i)
    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size + LaneSize;
  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);

  // Remap RepeatMask into a lane-repeating shuffle over NewV1/NewV2.
  for (int i = 0; i != Size; ++i) {
    int M = RepeatMask[i % LaneSize];
    PermuteMask[i] = M;
    if (PermuteMask[i] < 0)
RKSimon: Do we gain anything by relaxing this and keeping PermuteMask[i] as UNDEF if the original Mask[i] was UNDEF?
craig.topper: Not sure. I was trying to create a repeated lane shuffle, so it's based on both lanes. If it's undef in both lanes it will be undef here.
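A sketch of the relaxation being discussed (hypothetical; the patch does not do this): leave the final mask entry undef whenever the original element was undef, rather than inheriting the value the other lane contributed to RepeatMask. Something like the following at the top of this loop:
```
    // Hypothetical relaxation: keep an undef from the original mask as undef
    // instead of using the entry RepeatMask picked up from the other lane.
    if (Mask[i] < 0) {
      PermuteMask[i] = -1;
      continue;
    }
```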
      continue;
    if (PermuteMask[i] >= LaneSize)
      PermuteMask[i] += Size - LaneSize;
    PermuteMask[i] += (i / LaneSize) * LaneSize;
  }

  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, PermuteMask);
}
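To make the transform concrete, here is a worked trace of the new lowering (an editorial illustration; the mask values are chosen for the example and do not come from the patch or its tests):
```
// Example: v4f64, Mask = <2, 1, 6, 5>  (Size = 4, LaneSize = 2)
//   The low result lane <2, 1> reads only V1, the high result lane <6, 5>
//   reads only V2, and both use the same local pattern, so RepeatMask = <2, 1>.
// Split the inputs by lane:
//   NewV1 = shuffle(V1, V2, <0, 1, 4, 5>)   // low  lanes of V1 and V2
//   NewV2 = shuffle(V1, V2, <2, 3, 6, 7>)   // high lanes of V1 and V2
// One lane-repeating shuffle then produces the requested result:
//   shuffle(NewV1, NewV2, <4, 1, 6, 3>) == <V1[2], V1[1], V2[2], V2[1]>
```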

/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
[1,924 lines skipped]
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
RKSimon: This comment + FIXME needs updating.
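To illustrate what this routine is after, here is a worked trace (editorial illustration only; the mask is chosen for the example and is not from the patch):
```
// Example: v8f32, Mask = <9, 8, 11, 10, 1, 0, 3, 2>  (LaneSize = 4)
//   Result lane 0 reads only lane 0 of V2, result lane 1 reads only lane 0 of
//   V1, and both lanes use the repeating in-lane pattern <1, 0, 3, 2>.
// Step 1: fix the 128-bit lanes with a wide (v4f64/v4i64) lane shuffle:
//   L = shuffle(V1, V2, <4, 5, 0, 1>)       // <V2-lane0, V1-lane0>, 64-bit elts
// Step 2: a single-input, non-lane-crossing repeating shuffle finishes it:
//   shuffle(bitcast<v8f32>(L), undef, <1, 0, 3, 2, 5, 4, 7, 6>)
```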
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
RKSimon: There is an is128BitLaneRepeatedShuffleMask(VT, Mask) version that you can use instead.
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
RKSimon: This seems easier to grok (at least to me):
```
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
  Src = 0;
else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
  Src = 1;
else
  return SDValue();
Srcs[Src] = LaneSrc;
InLaneMask[i] = (M % LaneSize) + Src * Size;
```
      continue;

    int j = i / LaneSize;
    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }
RKSimon: for (int i = 0, e = M1.size(); i != e; ++i)
    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
RKSimon: for (int i = 0, e = MergedMask.size(); i != e; ++i) {
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
[468 lines skipped]
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
    return V;

  // If we have AVX2 then we always want to lower with a blend because at v4 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                      Mask, DAG);

  // Attempt to lower a shuffle where one lane comes from V1 and the other
  // lane comes from V2 and both lanes use the same shuffle operation. We can
  // create a new V1 from the lower lane of V1 and the lower lane of V2, and a
  // new V2 from the upper lane of V1 and the upper lane of V2, and then do a
  // repeated lane shuffle.
  if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v4f64, V1, V2,
                                                 Mask, DAG))
    return V;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
[177 lines skipped]
  if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
    return V;

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG);

  // Attempt to lower a shuffle where one lane comes from V1 and the other
  // lane comes from V2 and both lanes use the same shuffle operation. We can
  // create a new V1 from the lower lane of V1 and the lower lane of V2, and a
  // new V2 from the upper lane of V1 and the upper lane of V2, and then do a
  // repeated lane shuffle.
  if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v8f32, V1, V2,
                                                 Mask, DAG))
    return V;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
[25,417 lines skipped]