# Changeset View

Changeset View

# Standalone View

Standalone View

# lib/Target/X86/X86ISelLowering.cpp

- This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,741 Lines • ▼ Show 20 Lines | return DAG.getVectorShuffle( | ||||

VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, | VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, | ||||

DL, VT, V1, V2), | DL, VT, V1, V2), | ||||

DAG.getUNDEF(VT), PermMask); | DAG.getUNDEF(VT), PermMask); | ||||

} | } | ||||

return SDValue(); | return SDValue(); | ||||

} | } | ||||

// Attempt to lower a shuffle where one lane comes from V1 and the other
// lane comes from V2 and the lanes do the same operation. We can create
// a new V1 with the lower lane of V1 and the lower lane of V2. And a new
// V2 with the upper lane of V1 and the upper lane of V2 and then do a
// repeated lane shuffle.
//
// Returns the rewritten shuffle, or an empty SDValue if the mask does not
// fit this pattern (an element crosses to the wrong source, or the in-lane
// masks do not repeat).
static SDValue lowerVectorShuffleSplitLowHigh(const SDLoc &DL,
                                              MVT VT, SDValue V1,
                                              SDValue V2,
                                              ArrayRef<int> Mask,
                                              SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();

  // The per-lane shuffle must repeat across lanes, so the repeated mask only
  // needs to be a single 128-bit lane wide.
  SmallVector<int, 8> RepeatMask(LaneSize, -1);
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;

    // Make sure the element comes from the same source as the half.
    if ((M / Size) != (i / LaneSize))
      return SDValue();

    // Index of the element within its own source vector.
    int LocalM = M % Size;
    if (RepeatMask[i % LaneSize] < 0)
      RepeatMask[i % LaneSize] = LocalM;
    else if (RepeatMask[i % LaneSize] != LocalM)
      return SDValue();
  }

  // NewV1 = lower lane of V1 concatenated with lower lane of V2.
  SmallVector<int, 8> PermuteMask(Size, -1);
  for (int i = 0; i != Size; ++i)
    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size;
  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);

  // NewV2 = upper lane of V1 concatenated with upper lane of V2.
  for (int i = 0; i != Size; ++i)
    PermuteMask[i] = (i % LaneSize) + (i / LaneSize) * Size + LaneSize;
  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, PermuteMask);

  // Build the repeated lane shuffle of NewV1/NewV2.
  for (int i = 0; i != Size; ++i) {
    int M = RepeatMask[i % LaneSize];
    PermuteMask[i] = M;
    if (PermuteMask[i] < 0)
      continue;
    // Indices in the upper half of a source now live in NewV2; rebase them
    // past NewV1's elements.
    if (PermuteMask[i] >= LaneSize)
      PermuteMask[i] += Size - LaneSize;
    // Offset into the lane that element i belongs to.
    PermuteMask[i] += (i / LaneSize) * LaneSize;
  }

  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, PermuteMask);
}

/// \brief Handle lowering of 2-lane 64-bit floating point shuffles. | /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. | ||||

/// | /// | ||||

/// This is the basis function for the 2-lane 64-bit shuffles as we have full | /// This is the basis function for the 2-lane 64-bit shuffles as we have full | ||||

/// support for floating point shuffles but not integer shuffles. These | /// support for floating point shuffles but not integer shuffles. These | ||||

/// instructions will incur a domain crossing penalty on some chips though so | /// instructions will incur a domain crossing penalty on some chips though so | ||||

/// it is better to avoid lowering through this for integer vectors where | /// it is better to avoid lowering through this for integer vectors where | ||||

/// possible. | /// possible. | ||||

static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, | static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, | ||||

▲ Show 20 Lines • Show All 1,924 Lines • ▼ Show 20 Lines | |||||

/// This will only succeed when the result of fixing the 128-bit lanes results | /// This will only succeed when the result of fixing the 128-bit lanes results | ||||

/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in | /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in | ||||

/// each 128-bit lanes. This handles many cases where we can quickly blend away | /// each 128-bit lanes. This handles many cases where we can quickly blend away | ||||

/// the lane crosses early and then use simpler shuffles within each lane. | /// the lane crosses early and then use simpler shuffles within each lane. | ||||

/// | /// | ||||

/// FIXME: It might be worthwhile at some point to support this without | /// FIXME: It might be worthwhile at some point to support this without | ||||

/// requiring the 128-bit lane-relative shuffles to be repeating, but currently | /// requiring the 128-bit lane-relative shuffles to be repeating, but currently | ||||

/// in x86 only floating point has interesting non-repeating shuffles, and even | /// in x86 only floating point has interesting non-repeating shuffles, and even | ||||

/// those are still *marginally* more expensive. | /// those are still *marginally* more expensive. | ||||

Not Done ReplyInline ActionsThis comment + FIXME needs updating RKSimon: This comment + FIXME needs updating | |||||

static SDValue lowerVectorShuffleByMerging128BitLanes( | static SDValue lowerVectorShuffleByMerging128BitLanes( | ||||

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | ||||

const X86Subtarget &Subtarget, SelectionDAG &DAG) { | const X86Subtarget &Subtarget, SelectionDAG &DAG) { | ||||

assert(!V2.isUndef() && "This is only useful with multiple inputs."); | assert(!V2.isUndef() && "This is only useful with multiple inputs."); | ||||

int Size = Mask.size(); | int Size = Mask.size(); | ||||

int LaneSize = 128 / VT.getScalarSizeInBits(); | int LaneSize = 128 / VT.getScalarSizeInBits(); | ||||

int NumLanes = Size / LaneSize; | int NumLanes = Size / LaneSize; | ||||

Not Done ReplyInline ActionsThere is a is128BitLaneRepeatedShuffleMask(VT, Mask) version that you can use instead RKSimon: There is a is128BitLaneRepeatedShuffleMask(VT, Mask) version that you can use instead | |||||

assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); | assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); | ||||

// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also | // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also | ||||

// check whether the in-128-bit lane shuffles share a repeating pattern. | // check whether the in-128-bit lane shuffles share a repeating pattern. | ||||

SmallVector<int, 4> Lanes((unsigned)NumLanes, -1); | SmallVector<int, 4> Lanes((unsigned)NumLanes, -1); | ||||

SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1); | SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1); | ||||

for (int i = 0; i < Size; ++i) { | for (int i = 0; i < Size; ++i) { | ||||

if (Mask[i] < 0) | if (Mask[i] < 0) | ||||

Not Done ReplyInline ActionsThis seems easier to grok (at least to me): int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; else if (Srcs[1] < 0 || Srcs[1] == LaneSrc) Src = 1; else return SDValue(); Srcs[Src] = LaneSrc; InLaneMask[i] = (M % LaneSize) + Src * Size; RKSimon: This seems easier to grok (at least to me):
```
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)… | |||||

continue; | continue; | ||||

int j = i / LaneSize; | int j = i / LaneSize; | ||||

if (Lanes[j] < 0) { | if (Lanes[j] < 0) { | ||||

// First entry we've seen for this lane. | // First entry we've seen for this lane. | ||||

Lanes[j] = Mask[i] / LaneSize; | Lanes[j] = Mask[i] / LaneSize; | ||||

} else if (Lanes[j] != Mask[i] / LaneSize) { | } else if (Lanes[j] != Mask[i] / LaneSize) { | ||||

// This doesn't match the lane selected previously! | // This doesn't match the lane selected previously! | ||||

return SDValue(); | return SDValue(); | ||||

} | } | ||||

Not Done ReplyInline Actionsfor (int i = 0, e = M1.size(); i != e; ++i) RKSimon: for (int i = 0, e = M1.size(); i != e; ++i) | |||||

// Check that within each lane we have a consistent shuffle mask. | // Check that within each lane we have a consistent shuffle mask. | ||||

int k = i % LaneSize; | int k = i % LaneSize; | ||||

if (InLaneMask[k] < 0) { | if (InLaneMask[k] < 0) { | ||||

InLaneMask[k] = Mask[i] % LaneSize; | InLaneMask[k] = Mask[i] % LaneSize; | ||||

} else if (InLaneMask[k] != Mask[i] % LaneSize) { | } else if (InLaneMask[k] != Mask[i] % LaneSize) { | ||||

// This doesn't fit a repeating in-lane mask. | // This doesn't fit a repeating in-lane mask. | ||||

return SDValue(); | return SDValue(); | ||||

} | } | ||||

Not Done ReplyInline Actionsfor (int i = 0, e = MergedMask.size(); i != e; ++i) { RKSimon: for (int i = 0, e = MergedMask.size(); i != e; ++i) { | |||||

} | } | ||||

// First shuffle the lanes into place. | // First shuffle the lanes into place. | ||||

MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, | MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, | ||||

VT.getSizeInBits() / 64); | VT.getSizeInBits() / 64); | ||||

SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1); | SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1); | ||||

for (int i = 0; i < NumLanes; ++i) | for (int i = 0; i < NumLanes; ++i) | ||||

if (Lanes[i] >= 0) { | if (Lanes[i] >= 0) { | ||||

▲ Show 20 Lines • Show All 468 Lines • ▼ Show 20 Lines | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, | ||||

return V; | return V; | ||||

// If we have AVX2 then we always want to lower with a blend because at v4 we | // If we have AVX2 then we always want to lower with a blend because at v4 we | ||||

// can fully permute the elements. | // can fully permute the elements. | ||||

if (Subtarget.hasAVX2()) | if (Subtarget.hasAVX2()) | ||||

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, | ||||

Mask, DAG); | Mask, DAG); | ||||

// Attempt to lower a shuffle where one lane comes from V1 and the other | |||||

// lane comes from V2 and the lanes do the same operation. We can create | |||||

// a new V1 with the lower lane of V1 and the lower lane of V2. And a new | |||||

// V2 with the upper lane of V1 and the upper lane of V2 and then do a | |||||

// repeated lane shuffle. | |||||

if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v4f64, V1, V2, | |||||

Mask, DAG)) | |||||

return V; | |||||

// Otherwise fall back on generic lowering. | // Otherwise fall back on generic lowering. | ||||

return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); | return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); | ||||

} | } | ||||

/// \brief Handle lowering of 4-lane 64-bit integer shuffles. | /// \brief Handle lowering of 4-lane 64-bit integer shuffles. | ||||

/// | /// | ||||

/// This routine is only called when we have AVX2 and thus a reasonable | /// This routine is only called when we have AVX2 and thus a reasonable | ||||

/// instruction set for v4i64 shuffling.. | /// instruction set for v4i64 shuffling.. | ||||

▲ Show 20 Lines • Show All 177 Lines • ▼ Show 20 Lines | if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, | ||||

return V; | return V; | ||||

// If we have AVX2 then we always want to lower with a blend because at v8 we | // If we have AVX2 then we always want to lower with a blend because at v8 we | ||||

// can fully permute the elements. | // can fully permute the elements. | ||||

if (Subtarget.hasAVX2()) | if (Subtarget.hasAVX2()) | ||||

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, | ||||

Mask, DAG); | Mask, DAG); | ||||

// Attempt to lower a shuffle where one lane comes from V1 and the other | |||||

// lane comes from V2 and the lanes do the same operation. We can create | |||||

// a new V1 with the lower lane of V1 and the lower lane of V2. And a new | |||||

// V2 with the upper lane of V1 and the upper lane of V2 and then do a | |||||

// repeated lane shuffle. | |||||

if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v8f32, V1, V2, | |||||

Mask, DAG)) | |||||

return V; | |||||

// Otherwise fall back on generic lowering. | // Otherwise fall back on generic lowering. | ||||

return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); | return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); | ||||

} | } | ||||

/// \brief Handle lowering of 8-lane 32-bit integer shuffles. | /// \brief Handle lowering of 8-lane 32-bit integer shuffles. | ||||

/// | /// | ||||

/// This routine is only called when we have AVX2 and thus a reasonable | /// This routine is only called when we have AVX2 and thus a reasonable | ||||

/// instruction set for v8i32 shuffling.. | /// instruction set for v8i32 shuffling.. | ||||

▲ Show 20 Lines • Show All 25,417 Lines • Show Last 20 Lines |

Shouldn't RepeatMask be just LaneSize wide?