diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11593,23 +11593,8 @@ if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (M[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = M[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - // The cost tables encode cost 0 or cost 1 shuffles using the value 0 in - // the top 2 bits. - if (Cost == 0) + unsigned Cost = getPerfectShuffleCost(M); + if (Cost <= 1) return true; } diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -14,6 +14,8 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H +#include "llvm/ADT/ArrayRef.h" + // 31 entries have cost 0 // 730 entries have cost 1 // 3658 entries have cost 2 @@ -6584,4 +6586,36 @@ 835584U, // : Cost 0 copy LHS 0}; +static unsigned getPerfectShuffleCost(llvm::ArrayRef M) { + assert(M.size() == 4 && "Expected a 4 entry perfect shuffle"); + + // Special case zero-cost nop copies, from either LHS or RHS. + if (llvm::all_of(llvm::enumerate(M), [](auto &E) { + return E.value() < 0 || E.value() == (int)E.index(); + })) + return 0; + if (llvm::all_of(llvm::enumerate(M), [](auto &E) { + return E.value() < 0 || E.value() == (int)E.index() + 4; + })) + return 0; + + // Get the four mask elementd from the 2 inputs. Perfect shuffles encode undef + // elements with value 8. + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + assert(M[i] < 8 && "Expected a maximum entry of 8 for shuffle mask"); + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + // And extract the cost from the upper bits. The cost is encoded as Cost-1. + return (PFEntry >> 30) + 1; +} + #endif diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -8,6 +8,7 @@ #include "AArch64TargetTransformInfo.h" #include "AArch64ExpandImm.h" +#include "AArch64PerfectShuffle.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" @@ -2597,19 +2598,27 @@ ArrayRef Args) { Kind = improveShuffleKindFromMask(Kind, Mask); std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + + // Check for broadcast loads. + if (Kind == TTI::SK_Broadcast) { + bool IsLoad = !Args.empty() && isa(Args[0]); + if (IsLoad && LT.second.isVector() && + isLegalBroadcastLoad(Tp->getElementType(), + LT.second.getVectorElementCount())) + return 0; // broadcast is handled by ld1r + } + + // If we have 4 elements for the shuffle and a Mask, get the cost straight + // from the perfect shuffle tables. + if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && + (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && + all_of(Mask, [](int E) { return E < 8; })) + return getPerfectShuffleCost(Mask); + if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_Reverse) { - // Check for broadcast loads. - if (Kind == TTI::SK_Broadcast) { - bool IsLoad = !Args.empty() && isa(Args[0]); - if (IsLoad && LT.second.isVector() && - isLegalBroadcastLoad(Tp->getElementType(), - LT.second.getVectorElementCount())) - return 0; // broadcast is handled by ld1r - } - static const CostTblEntry ShuffleTbl[] = { // Broadcast shuffle kinds can be performed with 'dup'. { TTI::SK_Broadcast, MVT::v8i8, 1 }, diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -8,12 +8,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> @@ -39,16 +39,16 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -92,8 +92,8 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> @@ -105,7 +105,7 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll @@ -20,7 +20,7 @@ } ; COST-LABEL: sel.v4i16 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; COST: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; CODE-LABEL: sel.v4i16 ; CODE: rev32 v0.4h, v0.4h ; CODE: trn2 v0.4h, v0.4h, v1.4h diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -308,16 +308,10 @@ define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @reverse_hadd_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[A:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[A]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x float> [[TMP3]] ; %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1