diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -8,7 +8,9 @@
 
 #include "RISCVTargetTransformInfo.h"
 #include "MCTargetDesc/RISCVMatInt.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -323,6 +325,41 @@
     return LT.first * getLMULCost(LT.second);
   }
 
+  if (isa<FixedVectorType>(Tp) && Kind == TTI::SK_PermuteSingleSrc &&
+      Mask.size() >= 2) {
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+    // getVectorElementType() needs a vector MVT, so only continue when the
+    // legalized type is still a fixed-length vector.
+    // NOTE(review): presumably legalization can scalarize Tp - confirm.
+    if (LT.second.isFixedLengthVector()) {
+      MVT EltTp = LT.second.getVectorElementType();
+      // If the size of the element is < ELEN then shuffles of interleaves and
+      // deinterleaves of 2 vectors can be lowered into the following
+      // sequences
+      if (EltTp.getScalarSizeInBits() < ST->getELEN()) {
+        auto InterleaveMask = createInterleaveMask(Mask.size() / 2, 2);
+        // Example sequence:
+        //   vsetivli zero, 4, e8, mf4, ta, ma (ignored)
+        //   vwaddu.vv v10, v8, v9
+        //   li a0, -1
+        //   vwmaccu.vx v10, a0, v9
+        //   vmv1r.v v8, v10
+        if (equal(InterleaveMask, Mask))
+          return 4 * LT.first * getLMULCost(LT.second);
+
+        // A deinterleave of 2 vectors takes every other element starting at
+        // element 0 or 1, so only those starts can match a stride-2 mask.
+        if (Mask[0] == 0 || Mask[0] == 1) {
+          auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
+          // Example sequence:
+          //   vnsrl.wi v10, v8, 0
+          if (equal(DeinterleaveMask, Mask))
+            return LT.first * getLMULCost(LT.second);
+        }
+      }
+    }
+  }
+
   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
 }
 
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes="print" 2>&1 -disable-output
-mtriple=riscv32 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV32 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v | FileCheck %s -check-prefixes=CHECK,RV64 +define <8 x i8> @interleave2_v8i8(<4 x i8> %v0, <4 x i8> %v1) { +; CHECK-LABEL: 'interleave2_v8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %concat, <8 x i8> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res +; + %concat = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> + %res = shufflevector <8 x i8> %concat, <8 x i8> poison, <8 x i32> + ret <8 x i8> %res +} + +define <8 x i32> @interleave2_v8i32(<4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: 'interleave2_v8i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = shufflevector <8 x i32> %concat, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %res +; + %concat = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %res = shufflevector <8 x i32> %concat, <8 x i32> poison, <8 x i32> + ret <8 x i32> %res +} + +; Should be expensive on RV32 because it can't widen +define <8 x i64> @interleave2_v8i64(<4 x i64> %v0, <4 x i64> %v1) { +; RV32-LABEL: 'interleave2_v8i64' +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; +; 
RV64-LABEL: 'interleave2_v8i64' +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; + %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> + %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> + ret <8 x i64> %res +} + +; TODO: getInstructionCost doesn't call getShuffleCost here because the shuffle changes length +define {<4 x i8>, <4 x i8>} @deinterleave_2(<8 x i8> %v) { +; CHECK-LABEL: 'deinterleave_2' +; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %v0 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %v1 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %res0 = insertvalue { <4 x i8>, <4 x i8> } poison, <4 x i8> %v0, 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %res1 = insertvalue { <4 x i8>, <4 x i8> } %res0, <4 x i8> %v1, 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret { <4 x i8>, <4 x i8> } %res1 +; + %v0 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> + %v1 = shufflevector <8 x i8> %v, <8 x i8> poison, <4 x i32> + %res0 = insertvalue {<4 x i8>, <4 x i8>} poison, <4 x i8> %v0, 0 + %res1 = insertvalue {<4 x i8>, <4 x i8>} %res0, <4 x i8> %v1, 1 + ret {<4 x i8>, <4 x i8>} %res1 +}