Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -639,15 +639,16 @@ /// \brief The various kinds of shuffle patterns for vector queries. enum ShuffleKind { - SK_Broadcast, ///< Broadcast element 0 to all other elements. - SK_Reverse, ///< Reverse the order of the vector. - SK_Alternate, ///< Choose alternate elements from vector. - SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset. - SK_ExtractSubvector,///< ExtractSubvector Index indicates start offset. - SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one - ///< with any shuffle mask. - SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any - ///< shuffle mask. + SK_Broadcast, ///< Broadcast element 0 to all other elements. + SK_Reverse, ///< Reverse the order of the vector. + SK_Alternate, ///< Choose alternate elements from vector. + SK_Transpose, ///< Transpose two vectors. + SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset. + SK_ExtractSubvector, ///< ExtractSubvector Index indicates start offset. + SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one + ///< with any shuffle mask. + SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any + ///< shuffle mask. }; /// \brief Additional information about an operand's possible values. Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -553,11 +553,15 @@ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc || - Kind == TTI::SK_PermuteSingleSrc) { + switch (Kind) { + case TTI::SK_Alternate: + case TTI::SK_Transpose: + case TTI::SK_PermuteSingleSrc: + case TTI::SK_PermuteTwoSrc: return getPermuteShuffleOverhead(Tp); + default: + return 1; } - return 1; } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -683,6 +683,72 @@ return isAlternate; } +static bool isTransposeVectorMask(ArrayRef Mask) { + // Transpose vector masks transpose a 2xn matrix. They read corresponding + // even- or odd-numbered vector elements from two n-dimentional source + // vectors and write each result into consectuive elements of an + // n-dimensional destination vector. Two shuffles are necessary to complete + // the transpose, one for the even elements and another for the odd elements. + // This description closely follows how the TRN1 and TRN2 AArch64 + // instructions operate. + // + // For example, a simple 2x2 matrix can be transposed with: + // + // ; Original matrix + // m0 = + // m1 = + // + // ; Transposed matrix + // t0 = = shufflevector m0, m1, <0, 2> + // t1 = = shufflevector m0, m1, <1, 3> + // + // For matrices having greater than n columns, the resulting nx2 transposed + // matrix is stored in two result vectors such that one vector contains + // elements from all the even-numbered rows and the other vector contains + // elements from all the odd-numbered rows. For example, a 2x4 matrix can be + // transposed with: + // + // ; Original matrix + // m0 = + // m1 = + // + // ; Transposed matrix + // t0 = = shufflevector m0, m1 <0, 4, 2, 6> + // t1 = = shufflevector m0, m1 <1, 5, 3, 7> + // + // The above explanation places limations on what valid transpose masks can + // look like. These limitations are defined by the checks below. + // + // 1. The number of elements in the mask must be a power of two. + if (!isPowerOf2_32(Mask.size())) + return false; + + // 2. The first element of the mask must be either a zero (for the + // even-numbered vector elements) or a one (for the odd-numbered vector + // elements). + if (Mask[0] != 0 && Mask[0] != 1) + return false; + + // 3. The difference between the first two elements must be equal to the + // number of elements in the mask. + if (Mask[1] - Mask[0] != (int)Mask.size()) + return false; + + // 4. The difference between consecutive even-numbered elements must be equal + // to two. + for (int I = 2; I < (int)Mask.size(); I += 2) + if (Mask[I] - Mask[I - 2] != 2) + return false; + + // 5. The difference between consecutive odd-numbered elements must also be + // equal to two. + for (int I = 3; I < (int)Mask.size(); I += 2) + if (Mask[I] - Mask[I - 2] != 2) + return false; + + return true; +} + static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) { TargetTransformInfo::OperandValueKind OpInfo = TargetTransformInfo::OK_AnyValue; @@ -1139,22 +1205,26 @@ if (NumVecElems == Mask.size()) { if (isReverseVectorMask(Mask)) - return getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, - 0, nullptr); + return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Reverse, + VecTypOp0, 0, nullptr); if (isAlternateVectorMask(Mask)) - return getShuffleCost(TargetTransformInfo::SK_Alternate, - VecTypOp0, 0, nullptr); + return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Alternate, + VecTypOp0, 0, nullptr); + + if (isTransposeVectorMask(Mask)) + return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Transpose, + VecTypOp0, 0, nullptr); if (isZeroEltBroadcastVectorMask(Mask)) - return getShuffleCost(TargetTransformInfo::SK_Broadcast, - VecTypOp0, 0, nullptr); + return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Broadcast, + VecTypOp0, 0, nullptr); if (isSingleSourceVectorMask(Mask)) - return getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - VecTypOp0, 0, nullptr); + return TTIImpl->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + VecTypOp0, 0, nullptr); - return getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - VecTypOp0, 0, nullptr); + return TTIImpl->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + VecTypOp0, 0, nullptr); } return -1; Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -169,6 +169,8 @@ int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -912,3 +912,30 @@ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm); } + +int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + + // Transpose shuffle kinds can be performed with 'trn1/trn2' and 'zip1/zip2' + // instructions. + if (Kind == TTI::SK_Transpose) { + static const CostTblEntry TransposeTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + }; + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = + CostTableLookup(TransposeTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} Index: test/Analysis/CostModel/AArch64/shuffle-transpose.ll =================================================================== --- test/Analysis/CostModel/AArch64/shuffle-transpose.ll +++ test/Analysis/CostModel/AArch64/shuffle-transpose.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE ; COST-LABEL: trn1.v8i8 -; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CODE-LABEL: trn1.v8i8 ; CODE: trn1 v0.8b, v0.8b, v1.8b define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { @@ -11,7 +11,7 @@ } ; COST-LABEL: trn2.v8i8 -; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CODE-LABEL: trn2.v8i8 ; CODE: trn2 v0.8b, v0.8b, v1.8b define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { @@ -20,7 +20,7 @@ } ; COST-LABEL: trn1.v16i8 -; COST: Found an estimated cost of 90 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CODE-LABEL: trn1.v16i8 ; CODE: trn1 v0.16b, v0.16b, v1.16b define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { @@ -29,7 +29,7 @@ } ; COST-LABEL: trn2.v16i8 -; COST: Found an estimated cost of 90 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CODE-LABEL: trn2.v16i8 ; CODE: trn2 v0.16b, v0.16b, v1.16b define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { @@ -38,7 +38,7 @@ } ; COST-LABEL: trn1.v4i16 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; CODE-LABEL: trn1.v4i16 ; CODE: trn1 v0.4h, v0.4h, v1.4h define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { @@ -47,7 +47,7 @@ } ; COST-LABEL: trn2.v4i16 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; CODE-LABEL: trn2.v4i16 ; CODE: trn2 v0.4h, v0.4h, v1.4h define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { @@ -56,7 +56,7 @@ } ; COST-LABEL: trn1.v8i16 -; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CODE-LABEL: trn1.v8i16 ; CODE: trn1 v0.8h, v0.8h, v1.8h define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { @@ -65,7 +65,7 @@ } ; COST-LABEL: trn2.v8i16 -; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CODE-LABEL: trn2.v8i16 ; CODE: trn2 v0.8h, v0.8h, v1.8h define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { @@ -74,7 +74,7 @@ } ; COST-LABEL: trn1.v2i32 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ; CODE-LABEL: trn1.v2i32 ; CODE: zip1 v0.2s, v0.2s, v1.2s define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { @@ -83,7 +83,7 @@ } ; COST-LABEL: trn2.v2i32 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ; CODE-LABEL: trn2.v2i32 ; CODE: zip2 v0.2s, v0.2s, v1.2s define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { @@ -92,7 +92,7 @@ } ; COST-LABEL: trn1.v4i32 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; CODE-LABEL: trn1.v4i32 ; CODE: trn1 v0.4s, v0.4s, v1.4s define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { @@ -101,7 +101,7 @@ } ; COST-LABEL: trn2.v4i32 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; CODE-LABEL: trn2.v4i32 ; CODE: trn2 v0.4s, v0.4s, v1.4s define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { @@ -110,7 +110,7 @@ } ; COST-LABEL: trn1.v2i64 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ; CODE-LABEL: trn1.v2i64 ; CODE: zip1 v0.2d, v0.2d, v1.2d define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) { @@ -119,7 +119,7 @@ } ; COST-LABEL: trn2.v2i64 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ; CODE-LABEL: trn2.v2i64 ; CODE: zip2 v0.2d, v0.2d, v1.2d define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { @@ -128,7 +128,7 @@ } ; COST-LABEL: trn1.v2f32 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ; CODE-LABEL: trn1.v2f32 ; CODE: zip1 v0.2s, v0.2s, v1.2s define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { @@ -137,7 +137,7 @@ } ; COST-LABEL: trn2.v2f32 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ; CODE-LABEL: trn2.v2f32 ; CODE: zip2 v0.2s, v0.2s, v1.2s define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { @@ -146,7 +146,7 @@ } ; COST-LABEL: trn1.v4f32 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; CODE-LABEL: trn1.v4f32 ; CODE: trn1 v0.4s, v0.4s, v1.4s define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { @@ -155,7 +155,7 @@ } ; COST-LABEL: trn2.v4f32 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; CODE-LABEL: trn2.v4f32 ; CODE: trn2 v0.4s, v0.4s, v1.4s define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { @@ -164,7 +164,7 @@ } ; COST-LABEL: trn1.v2f64 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ; CODE-LABEL: trn1.v2f64 ; CODE: zip1 v0.2d, v0.2d, v1.2d define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) { @@ -173,7 +173,7 @@ } ; COST-LABEL: trn2.v2f64 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ; CODE-LABEL: trn2.v2f64 ; CODE: zip2 v0.2d, v0.2d, v1.2d define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) {