diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -2430,6 +2430,33 @@ } } + /// Return if this shuffle interleaves its two input vectors together. + bool isInterleave(unsigned Factor); + + /// Return true if the mask interleaves one or more input vectors together. + /// + /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> + /// E.g. For a Factor of 2 (LaneLen=4): + /// <0, 4, 1, 5, 2, 6, 3, 7> + /// E.g. For a Factor of 3 (LaneLen=4): + /// <4, 0, 9, 5, 1, 10, 6, 2, 11, 7, 3, 12> + /// E.g. For a Factor of 4 (LaneLen=2): + /// <0, 2, 6, 4, 1, 3, 7, 5> + /// + /// NumInputElts is the total number of elements in the input vectors. + /// + /// StartIndexes are the first indexes of each vector being interleaved, + /// substituting any indexes that were undef + /// E.g. <4, -1, 2, 5, 1, 3> (Factor=3): StartIndexes=<4, 0, 2> + static bool isInterleaveMask(ArrayRef Mask, unsigned Factor, + unsigned NumInputElts, + SmallVectorImpl &StartIndexes); + static bool isInterleaveMask(ArrayRef Mask, unsigned Factor, + unsigned NumInputElts) { + SmallVector StartIndexes; + return isInterleaveMask(Mask, Factor, NumInputElts, StartIndexes); + } + // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Instruction *I) { return I->getOpcode() == Instruction::ShuffleVector; diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -202,86 +202,15 @@ /// The particular case of an RE-interleave mask is: /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> -static bool isReInterleaveMask(ArrayRef Mask, unsigned &Factor, - unsigned MaxFactor, unsigned OpNumElts) { - unsigned NumElts = Mask.size(); +static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, + unsigned MaxFactor) { + unsigned NumElts = SVI->getShuffleMask().size(); if (NumElts < 4) return false; // Check potential Factors. for (Factor = 2; Factor <= MaxFactor; Factor++) { - if (NumElts % Factor) - continue; - - unsigned LaneLen = NumElts / Factor; - if (!isPowerOf2_32(LaneLen)) - continue; - - // Check whether each element matches the general interleaved rule. - // Ignore undef elements, as long as the defined elements match the rule. - // Outer loop processes all factors (x, y, z in the above example) - unsigned I = 0, J; - for (; I < Factor; I++) { - unsigned SavedLaneValue; - unsigned SavedNoUndefs = 0; - - // Inner loop processes consecutive accesses (x, x+1... in the example) - for (J = 0; J < LaneLen - 1; J++) { - // Lane computes x's position in the Mask - unsigned Lane = J * Factor + I; - unsigned NextLane = Lane + Factor; - int LaneValue = Mask[Lane]; - int NextLaneValue = Mask[NextLane]; - - // If both are defined, values must be sequential - if (LaneValue >= 0 && NextLaneValue >= 0 && - LaneValue + 1 != NextLaneValue) - break; - - // If the next value is undef, save the current one as reference - if (LaneValue >= 0 && NextLaneValue < 0) { - SavedLaneValue = LaneValue; - SavedNoUndefs = 1; - } - - // Undefs are allowed, but defined elements must still be consecutive: - // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, .... - // Verify this by storing the last non-undef followed by an undef - // Check that following non-undef masks are incremented with the - // corresponding distance. - if (SavedNoUndefs > 0 && LaneValue < 0) { - SavedNoUndefs++; - if (NextLaneValue >= 0 && - SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue) - break; - } - } - - if (J < LaneLen - 1) - break; - - int StartMask = 0; - if (Mask[I] >= 0) { - // Check that the start of the I range (J=0) is greater than 0 - StartMask = Mask[I]; - } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) { - // StartMask defined by the last value in lane - StartMask = Mask[(LaneLen - 1) * Factor + I] - J; - } else if (SavedNoUndefs > 0) { - // StartMask defined by some non-zero value in the j loop - StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs); - } - // else StartMask remains set to 0, i.e. all elements are undefs - - if (StartMask < 0) - break; - // We must stay within the vectors; This case can happen with undefs. - if (StartMask + LaneLen > OpNumElts*2) - break; - } - - // Found an interleaved mask of current factor. - if (I == Factor) + if (SVI->isInterleave(Factor)) return true; } @@ -500,9 +429,7 @@ // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; - unsigned OpNumElts = - cast(SVI->getOperand(0)->getType())->getNumElements(); - if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts)) + if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n"); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2728,6 +2728,98 @@ return isOneUseSingleSourceMask(ShuffleMask, VF); } +bool ShuffleVectorInst::isInterleave(unsigned Factor) { + FixedVectorType *OpTy = dyn_cast(getOperand(0)->getType()); + // shuffle_vector can only interleave fixed length vectors - for scalable + // vectors, see the @llvm.experimental.vector.interleave2 intrinsic + if (!OpTy) + return false; + unsigned OpNumElts = OpTy->getNumElements(); + + return isInterleaveMask(ShuffleMask, Factor, OpNumElts * 2); +} + +bool ShuffleVectorInst::isInterleaveMask( + ArrayRef Mask, unsigned Factor, unsigned NumInputElts, + SmallVectorImpl &StartIndexes) { + unsigned NumElts = Mask.size(); + if (NumElts % Factor) + return false; + + unsigned LaneLen = NumElts / Factor; + if (!isPowerOf2_32(LaneLen)) + return false; + + StartIndexes.resize(Factor); + + // Check whether each element matches the general interleaved rule. + // Ignore undef elements, as long as the defined elements match the rule. + // Outer loop processes all factors (x, y, z in the above example) + unsigned I = 0, J; + for (; I < Factor; I++) { + unsigned SavedLaneValue; + unsigned SavedNoUndefs = 0; + + // Inner loop processes consecutive accesses (x, x+1... in the example) + for (J = 0; J < LaneLen - 1; J++) { + // Lane computes x's position in the Mask + unsigned Lane = J * Factor + I; + unsigned NextLane = Lane + Factor; + int LaneValue = Mask[Lane]; + int NextLaneValue = Mask[NextLane]; + + // If both are defined, values must be sequential + if (LaneValue >= 0 && NextLaneValue >= 0 && + LaneValue + 1 != NextLaneValue) + break; + + // If the next value is undef, save the current one as reference + if (LaneValue >= 0 && NextLaneValue < 0) { + SavedLaneValue = LaneValue; + SavedNoUndefs = 1; + } + + // Undefs are allowed, but defined elements must still be consecutive: + // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, .... + // Verify this by storing the last non-undef followed by an undef + // Check that following non-undef masks are incremented with the + // corresponding distance. + if (SavedNoUndefs > 0 && LaneValue < 0) { + SavedNoUndefs++; + if (NextLaneValue >= 0 && + SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue) + break; + } + } + + if (J < LaneLen - 1) + return false; + + int StartMask = 0; + if (Mask[I] >= 0) { + // Check that the start of the I range (J=0) is greater than 0 + StartMask = Mask[I]; + } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) { + // StartMask defined by the last value in lane + StartMask = Mask[(LaneLen - 1) * Factor + I] - J; + } else if (SavedNoUndefs > 0) { + // StartMask defined by some non-zero value in the j loop + StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs); + } + // else StartMask remains set to 0, i.e. all elements are undefs + + if (StartMask < 0) + return false; + // We must stay within the vectors; This case can happen with undefs. + if (StartMask + LaneLen > NumInputElts) + return false; + + StartIndexes[I] = StartMask; + } + + return true; +} + //===----------------------------------------------------------------------===// // InsertValueInst Class //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" @@ -3062,46 +3063,19 @@ return false; int Size = Mask.size(); - int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); - int Srcs[] = {-1, -1}; - for (int i = 0; i != Size; ++i) { - // Ignore undef elements. - if (Mask[i] < 0) - continue; - - // Is this an even or odd element. - int Pol = i % 2; + SmallVector StartIndexes; + if (!ShuffleVectorInst::isInterleaveMask(Mask, 2, Size * 2, StartIndexes)) + return false; - // Ensure we consistently use the same half source for this polarity. - int Src = alignDown(Mask[i], HalfSize); - if (Srcs[Pol] < 0) - Srcs[Pol] = Src; - if (Srcs[Pol] != Src) - return false; - - // Make sure the element within the source is appropriate for this element - // in the destination. - int Elt = Mask[i] % HalfSize; - if (Elt != i / 2) - return false; - } + EvenSrc = StartIndexes[0] % 2 ? StartIndexes[1] : StartIndexes[0]; + OddSrc = StartIndexes[0] % 2 ? StartIndexes[0] : StartIndexes[1]; // One source should be low half of first vector. - if (Srcs[0] != 0 && Srcs[1] != 0) + if (EvenSrc != 0 && OddSrc != 0) return false; - // Other source should be the upper half of the first source or the lower - // half of the second source. - // FIXME: This is only a heuristic to avoid regressions. - if (Srcs[0] != HalfSize && Srcs[0] != Size && Srcs[1] != HalfSize && - Srcs[1] != Size) - return false; - - EvenSrc = Srcs[0]; - OddSrc = Srcs[1]; - return true; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -10,10 +10,10 @@ #include "MCTargetDesc/RISCVMatInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/IR/Instructions.h" #include #include using namespace llvm; @@ -261,16 +261,17 @@ if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) { MVT EltTp = LT.second.getVectorElementType(); // If the size of the element is < ELEN then shuffles of interleaves and - // deinterleaves of 2 vectors can be lowered into the following sequences + // deinterleaves of 2 vectors can be lowered into the following + // sequences if (EltTp.getScalarSizeInBits() < ST->getELEN()) { auto InterleaveMask = createInterleaveMask(Mask.size() / 2, 2); // Example sequence: - // vsetivli zero, 4, e8, mf4, ta, ma (ignored) + // vsetivli zero, 4, e8, mf4, ta, ma (ignored) // vwaddu.vv v10, v8, v9 // li a0, -1 (ignored) // vwmaccu.vx v10, a0, v9 - if (equal(InterleaveMask, Mask)) - return 2 * LT.first * getLMULCost(LT.second); + if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size() * 2)) + return 2 * LT.first * getLMULCost(LT.second); if (Mask[0] == 0 || Mask[0] == 1) { auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size()); diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll @@ -1,6 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV32 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV64 + +; The mask here interleaves (%v1, %v0), not (%v0, %v1): it should still be cheap. +define <4 x i8> @interleave2_v2i8(<2 x i8> %v0, <2 x i8> %v1) { +; CHECK-LABEL: 'interleave2_v2i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %concat = shufflevector <2 x i8> %v0, <2 x i8> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = shufflevector <4 x i8> %concat, <4 x i8> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i8> %res +; + %concat = shufflevector <2 x i8> %v0, <2 x i8> %v1, <4 x i32> + %res = shufflevector <4 x i8> %concat, <4 x i8> poison, <4 x i32> + ret <4 x i8> %res +} + define <8 x i8> @interleave2_v8i8(<4 x i8> %v0, <4 x i8> %v1) { ; CHECK-LABEL: 'interleave2_v8i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> diff --git a/llvm/unittests/IR/ShuffleVectorInstTest.cpp b/llvm/unittests/IR/ShuffleVectorInstTest.cpp --- a/llvm/unittests/IR/ShuffleVectorInstTest.cpp +++ b/llvm/unittests/IR/ShuffleVectorInstTest.cpp @@ -116,4 +116,33 @@ ShuffleVectorInst::isOneUseSingleSourceMask({0, 1, 2, 3, 3, 3, 1, 0}, 4)); } +TEST(ShuffleVectorInst, isInterleaveMask) { + SmallVector StartIndexes; + ASSERT_TRUE(ShuffleVectorInst::isInterleaveMask({0, 4, 1, 5, 2, 6, 3, 7}, 2, + 8, StartIndexes)); + ASSERT_EQ(StartIndexes, SmallVector({0, 4})); + + ASSERT_FALSE( + ShuffleVectorInst::isInterleaveMask({0, 4, 1, 6, 2, 6, 3, 7}, 2, 8)); + + ASSERT_TRUE(ShuffleVectorInst::isInterleaveMask({4, 0, 5, 1, 6, 2, 7, 3}, 2, + 8, StartIndexes)); + ASSERT_EQ(StartIndexes, SmallVector({4, 0})); + + ASSERT_TRUE(ShuffleVectorInst::isInterleaveMask({4, 0, -1, 1, -1, 2, 7, 3}, 2, + 8, StartIndexes)); + ASSERT_EQ(StartIndexes, SmallVector({4, 0})); + + ASSERT_TRUE(ShuffleVectorInst::isInterleaveMask({0, 2, 4, 1, 3, 5}, 3, 6, + StartIndexes)); + ASSERT_EQ(StartIndexes, SmallVector({0, 2, 4})); + + ASSERT_TRUE(ShuffleVectorInst::isInterleaveMask({4, -1, 0, 5, 3, 1}, 3, 6, + StartIndexes)); + ASSERT_EQ(StartIndexes, SmallVector({4, 2, 0})); + + ASSERT_FALSE( + ShuffleVectorInst::isInterleaveMask({8, 2, 12, 4, 9, 3, 13, 5}, 4, 8)); +} + } // end anonymous namespace