diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -199,11 +199,6 @@
       ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
       TTI::TargetCostKind CostKind, bool UseMaskForCond = false,
       bool UseMaskForGaps = false);
-  InstructionCost getInterleavedMemoryOpCostAVX2(
-      unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
-      ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
-      TTI::TargetCostKind CostKind, bool UseMaskForCond = false,
-      bool UseMaskForGaps = false);
 
   InstructionCost getIntImmCost(int64_t);
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5021,12 +5021,6 @@
     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
     ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
     TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
-
-  if (UseMaskForCond || UseMaskForGaps)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, CostKind,
-                                             UseMaskForCond, UseMaskForGaps);
-
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
   // VecTy = <12 x i32>.
@@ -5140,29 +5134,38 @@
   return Cost;
 }
 
-// Get estimation for interleaved load/store operations for AVX2.
-// \p Factor is the interleaved-access factor (stride) - number of
-// (interleaved) elements in the group.
-// \p Indices contains the indices for a strided load: when the
-// interleaved load has gaps they indicate which elements are used.
-// If Indices is empty (or if the number of indices is equal to the size
-// of the interleaved-access as given in \p Factor) the access has no gaps.
-//
-// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
-// computing the cost using a generic formula as a function of generic
-// shuffles. We therefore use a lookup table instead, filled according to
-// the instruction sequences that codegen currently generates.
-InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
-    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
-    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
-    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
-
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
+    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+    bool UseMaskForCond, bool UseMaskForGaps) {
+  auto *VecTy = cast<FixedVectorType>(BaseTy);
   if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace, CostKind,
                                              UseMaskForCond, UseMaskForGaps);
 
-  // We currently Support only fully-interleaved groups, with no gaps.
+  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
+    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
+    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+      return true;
+    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
+        (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
+      return HasBW;
+    return false;
+  };
+  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
+    return getInterleavedMemoryOpCostAVX512(
+        Opcode, VecTy, Factor, Indices, Alignment,
+        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
+
+  // Get estimation for interleaved load/store operations for SSE-AVX2.
+  // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
+  // computing the cost using a generic formula as a function of generic
+  // shuffles. We therefore use a lookup table instead, filled according to
+  // the instruction sequences that codegen currently generates.
+
+  // We currently support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
   if (Indices.size() && Indices.size() != Factor)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -5208,86 +5211,86 @@
       {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
       {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
       {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
-      {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
-      {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
+      {2, MVT::v16i8, 4},  // (load 32i8 and) deinterleave into 2 x 16i8
+      {2, MVT::v32i8, 6},  // (load 64i8 and) deinterleave into 2 x 32i8
 
-      {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
-      {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
-      {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
-      {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
+      {2, MVT::v2i16, 2},  // (load 4i16 and) deinterleave into 2 x 2i16
+      {2, MVT::v4i16, 2},  // (load 8i16 and) deinterleave into 2 x 4i16
+      {2, MVT::v8i16, 6},  // (load 16i16 and) deinterleave into 2 x 8i16
+      {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
       {2, MVT::v32i16, 18},  // (load 64i16 and) deinterleave into 2 x 32i16
 
-      {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
-      {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
-      {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
-      {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
+      {2, MVT::v2i32, 2},  // (load 4i32 and) deinterleave into 2 x 2i32
+      {2, MVT::v4i32, 2},  // (load 8i32 and) deinterleave into 2 x 4i32
+      {2, MVT::v8i32, 4},  // (load 16i32 and) deinterleave into 2 x 8i32
+      {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
       {2, MVT::v32i32, 16},  // (load 64i32 and) deinterleave into 2 x 32i32
 
-      {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
-      {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
-      {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
+      {2, MVT::v2i64, 2},  // (load 4i64 and) deinterleave into 2 x 2i64
+      {2, MVT::v4i64, 4},  // (load 8i64 and) deinterleave into 2 x 4i64
+      {2, MVT::v8i64, 8},  // (load 16i64 and) deinterleave into 2 x 8i64
       {2, MVT::v16i64, 16},  // (load 32i64 and) deinterleave into 2 x 16i64
 
-      {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
+      {3, MVT::v2i8, 3},  // (load 6i8 and) deinterleave into 3 x 2i8
       {3, MVT::v4i8, 3},  // (load 12i8 and) deinterleave into 3 x 4i8
       {3, MVT::v8i8, 6},  // (load 24i8 and) deinterleave into 3 x 8i8
       {3, MVT::v16i8, 11},  // (load 48i8 and) deinterleave into 3 x 16i8
       {3, MVT::v32i8, 14},  // (load 96i8 and) deinterleave into 3 x 32i8
 
-      {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
-      {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
-      {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
-      {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
-      {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
+      {3, MVT::v2i16, 5},  // (load 6i16 and) deinterleave into 3 x 2i16
+      {3, MVT::v4i16, 7},  // (load 12i16 and) deinterleave into 3 x 4i16
+      {3, MVT::v8i16, 9},  // (load 24i16 and) deinterleave into 3 x 8i16
+      {3, MVT::v16i16, 28},  // (load 48i16 and) deinterleave into 3 x 16i16
+      {3, MVT::v32i16, 56},  // (load 96i16 and) deinterleave into 3 x 32i16
 
-      {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
-      {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
-      {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
+      {3, MVT::v2i32, 3},  // (load 6i32 and) deinterleave into 3 x 2i32
+      {3, MVT::v4i32, 3},  // (load 12i32 and) deinterleave into 3 x 4i32
+      {3, MVT::v8i32, 7},  // (load 24i32 and) deinterleave into 3 x 8i32
       {3, MVT::v16i32, 14},  // (load 48i32 and) deinterleave into 3 x 16i32
 
-      {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
-      {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
+      {3, MVT::v2i64, 1},  // (load 6i64 and) deinterleave into 3 x 2i64
+      {3, MVT::v4i64, 5},  // (load 12i64 and) deinterleave into 3 x 4i64
       {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
-      {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
+      {3, MVT::v16i64, 20},  // (load 48i64 and) deinterleave into 3 x 16i64
 
-      {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
+      {4, MVT::v2i8, 4},  // (load 8i8 and) deinterleave into 4 x 2i8
       {4, MVT::v4i8, 4},  // (load 16i8 and) deinterleave into 4 x 4i8
       {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
       {4, MVT::v16i8, 24},  // (load 64i8 and) deinterleave into 4 x 16i8
       {4, MVT::v32i8, 56},  // (load 128i8 and) deinterleave into 4 x 32i8
 
-      {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
-      {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
-      {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
-      {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
+      {4, MVT::v2i16, 6},  // (load 8i16 and) deinterleave into 4 x 2i16
+      {4, MVT::v4i16, 17},  // (load 16i16 and) deinterleave into 4 x 4i16
+      {4, MVT::v8i16, 33},  // (load 32i16 and) deinterleave into 4 x 8i16
+      {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
       {4, MVT::v32i16, 150},  // (load 128i16 and) deinterleave into 4 x 32i16
 
-      {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
-      {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
-      {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
+      {4, MVT::v2i32, 4},  // (load 8i32 and) deinterleave into 4 x 2i32
+      {4, MVT::v4i32, 8},  // (load 16i32 and) deinterleave into 4 x 4i32
+      {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
       {4, MVT::v16i32, 32},  // (load 64i32 and) deinterleave into 4 x 16i32
 
-      {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
-      {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
+      {4, MVT::v2i64, 6},  // (load 8i64 and) deinterleave into 4 x 2i64
+      {4, MVT::v4i64, 8},  // (load 16i64 and) deinterleave into 4 x 4i64
       {4, MVT::v8i64, 20},  // (load 32i64 and) deinterleave into 4 x 8i64
 
-      {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
-      {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
-      {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
+      {6, MVT::v2i8, 6},  // (load 12i8 and) deinterleave into 6 x 2i8
+      {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
+      {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
       {6, MVT::v16i8, 43},  // (load 96i8 and) deinterleave into 6 x 16i8
       {6, MVT::v32i8, 82},  // (load 192i8 and) deinterleave into 6 x 32i8
 
-      {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
-      {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
-      {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
+      {6, MVT::v2i16, 13},  // (load 12i16 and) deinterleave into 6 x 2i16
+      {6, MVT::v4i16, 9},  // (load 24i16 and) deinterleave into 6 x 4i16
+      {6, MVT::v8i16, 39},  // (load 48i16 and) deinterleave into 6 x 8i16
       {6, MVT::v16i16, 106},  // (load 96i16 and) deinterleave into 6 x 16i16
 
-      {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
-      {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
-      {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
+      {6, MVT::v2i32, 6},  // (load 12i32 and) deinterleave into 6 x 2i32
+      {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
+      {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
       {6, MVT::v16i32, 64},  // (load 96i32 and) deinterleave into 6 x 16i32
 
-      {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
+      {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
       {6, MVT::v4i64, 18},  // (load 24i64 and) deinterleave into 6 x 4i64
       {6, MVT::v8i64, 36},  // (load 48i64 and) deinterleave into 6 x 8i64
 
@@ -5295,133 +5298,108 @@
   };
 
   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
-      {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
-      {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
-      {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
+      {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8 (and store)
+      {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8 (and store)
+      {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8 (and store)
       {2, MVT::v16i8, 3},  // interleave 2 x 16i8 into 32i8 (and store)
       {2, MVT::v32i8, 4},  // interleave 2 x 32i8 into 64i8 (and store)
 
-      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
-      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
-      {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
+      {2, MVT::v2i16, 1},  // interleave 2 x 2i16 into 4i16 (and store)
+      {2, MVT::v4i16, 1},  // interleave 2 x 4i16 into 8i16 (and store)
+      {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
       {2, MVT::v16i16, 4},  // interleave 2 x 16i16 into 32i16 (and store)
       {2, MVT::v32i16, 8},  // interleave 2 x 32i16 into 64i16 (and store)
 
-      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
-      {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
-      {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
-      {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
+      {2, MVT::v2i32, 1},  // interleave 2 x 2i32 into 4i32 (and store)
+      {2, MVT::v4i32, 2},  // interleave 2 x 4i32 into 8i32 (and store)
+      {2, MVT::v8i32, 4},  // interleave 2 x 8i32 into 16i32 (and store)
+      {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
       {2, MVT::v32i32, 16},  // interleave 2 x 32i32 into 64i32 (and store)
 
-      {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
-      {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
-      {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
+      {2, MVT::v2i64, 2},  // interleave 2 x 2i64 into 4i64 (and store)
+      {2, MVT::v4i64, 4},  // interleave 2 x 4i64 into 8i64 (and store)
+      {2, MVT::v8i64, 8},  // interleave 2 x 8i64 into 16i64 (and store)
       {2, MVT::v16i64, 16},  // interleave 2 x 16i64 into 32i64 (and store)
 
       {3, MVT::v2i8, 4},  // interleave 3 x 2i8 into 6i8 (and store)
       {3, MVT::v4i8, 4},  // interleave 3 x 4i8 into 12i8 (and store)
-      {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
+      {3, MVT::v8i8, 6},  // interleave 3 x 8i8 into 24i8 (and store)
       {3, MVT::v16i8, 11},  // interleave 3 x 16i8 into 48i8 (and store)
       {3, MVT::v32i8, 13},  // interleave 3 x 32i8 into 96i8 (and store)
 
       {3, MVT::v2i16, 4},  // interleave 3 x 2i16 into 6i16 (and store)
       {3, MVT::v4i16, 6},  // interleave 3 x 4i16 into 12i16 (and store)
-      {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
-      {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
-      {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
+      {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
+      {3, MVT::v16i16, 27},  // interleave 3 x 16i16 into 48i16 (and store)
+      {3, MVT::v32i16, 54},  // interleave 3 x 32i16 into 96i16 (and store)
 
       {3, MVT::v2i32, 4},  // interleave 3 x 2i32 into 6i32 (and store)
       {3, MVT::v4i32, 5},  // interleave 3 x 4i32 into 12i32 (and store)
-      {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
-      {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
+      {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
+      {3, MVT::v16i32, 22},  // interleave 3 x 16i32 into 48i32 (and store)
 
       {3, MVT::v2i64, 4},  // interleave 3 x 2i64 into 6i64 (and store)
       {3, MVT::v4i64, 6},  // interleave 3 x 4i64 into 12i64 (and store)
-      {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
-      {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
+      {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
+      {3, MVT::v16i64, 24},  // interleave 3 x 16i64 into 48i64 (and store)
 
-      {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
+      {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8 (and store)
       {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8 (and store)
-      {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
-      {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
+      {4, MVT::v8i8, 4},  // interleave 4 x 8i8 into 32i8 (and store)
+      {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
       {4, MVT::v32i8, 12},  // interleave 4 x 32i8 into 128i8 (and store)
 
-      {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
-      {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
+      {4, MVT::v2i16, 2},  // interleave 4 x 2i16 into 8i16 (and store)
+      {4, MVT::v4i16, 6},  // interleave 4 x 4i16 into 16i16 (and store)
       {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
-      {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
-      {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
+      {4, MVT::v16i16, 32},  // interleave 4 x 16i16 into 64i16 (and store)
+      {4, MVT::v32i16, 64},  // interleave 4 x 32i16 into 128i16 (and store)
 
-      {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
-      {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
+      {4, MVT::v2i32, 5},  // interleave 4 x 2i32 into 8i32 (and store)
+      {4, MVT::v4i32, 6},  // interleave 4 x 4i32 into 16i32 (and store)
       {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
-      {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
+      {4, MVT::v16i32, 32},  // interleave 4 x 16i32 into 64i32 (and store)
 
       {4, MVT::v2i64, 6},  // interleave 4 x 2i64 into 8i64 (and store)
       {4, MVT::v4i64, 8},  // interleave 4 x 4i64 into 16i64 (and store)
-      {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
+      {4, MVT::v8i64, 20},  // interleave 4 x 8i64 into 32i64 (and store)
 
-      {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
-      {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
+      {6, MVT::v2i8, 7},  // interleave 6 x 2i8 into 12i8 (and store)
+      {6, MVT::v4i8, 9},  // interleave 6 x 4i8 into 24i8 (and store)
       {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
-      {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
-      {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
+      {6, MVT::v16i8, 27},  // interleave 6 x 16i8 into 96i8 (and store)
+      {6, MVT::v32i8, 90},  // interleave 6 x 32i8 into 192i8 (and store)
 
       {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
       {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
       {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
-      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
+      {6, MVT::v16i16, 58},  // interleave 6 x 16i16 into 96i16 (and store)
 
-      {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
+      {6, MVT::v2i32, 9},  // interleave 6 x 2i32 into 12i32 (and store)
       {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
       {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
-      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
+      {6, MVT::v16i32, 66},  // interleave 6 x 16i32 into 96i32 (and store)
 
       {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
-      {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
-      {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
+      {6, MVT::v4i64, 15},  // interleave 6 x 4i64 into 24i64 (and store)
+      {6, MVT::v8i64, 30},  // interleave 6 x 8i64 into 48i64 (and store)
   };
 
   if (Opcode == Instruction::Load) {
-    if (const auto *Entry =
-            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
-      return MemOpCosts + Entry->Cost;
+    if (ST->hasAVX2())
+      if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
+                                              ETy.getSimpleVT()))
+        return MemOpCosts + Entry->Cost;
   } else {
     assert(Opcode == Instruction::Store &&
            "Expected Store Instruction at this point");
-    if (const auto *Entry =
-            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
-      return MemOpCosts + Entry->Cost;
+    if (ST->hasAVX2())
+      if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
+                                              ETy.getSimpleVT()))
+        return MemOpCosts + Entry->Cost;
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, CostKind);
-}
-
-InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
-    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
-    bool UseMaskForCond, bool UseMaskForGaps) {
-  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
-    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
-    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
-        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
-      return true;
-    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
-        (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
-      return HasBW;
-    return false;
-  };
-  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
-    return getInterleavedMemoryOpCostAVX512(
-        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
-        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
-  if (ST->hasAVX2())
-    return getInterleavedMemoryOpCostAVX2(
-        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
-        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
-
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  return BaseT::getInterleavedMemoryOpCost(Opcode, BaseTy, Factor, Indices,
                                            Alignment, AddressSpace, CostKind,
                                            UseMaskForCond, UseMaskForGaps);
 }
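
Not part of the patch: the hunks above fold the old AVX2-only helper into the generic entry point, so the dispatch order is now: bail out to the base implementation for masked or gappy groups, use the AVX-512 formula when the element type qualifies (8-bit, 16-bit, and half elements only with BWI), otherwise consult the SSE/AVX2 lookup tables, which are newly guarded by ST->hasAVX2() so pre-AVX2 subtargets (and table misses) fall through to the base estimate. The following standalone C++ sketch models only that dispatch order; the Subtarget struct, EltKind enum, CostPath values, and PickPath helper are invented stand-ins for illustration and are not LLVM's TTI or Subtarget APIs.

// Standalone illustration (not LLVM code): mirrors the dispatch order of the
// merged X86TTIImpl::getInterleavedMemoryOpCost after this patch. All names
// below are hypothetical stand-ins for the real Subtarget/TTI interfaces.
#include <cstdio>

enum class EltKind { F16, F32, F64, I8, I16, I32, I64, Ptr };

struct Subtarget {
  bool HasAVX512 = false;
  bool HasBWI = false;   // AVX-512BW: byte/word element support
  bool HasAVX2 = false;
  bool HasFP16 = false;  // AVX-512FP16
  bool SoftFloat = false;
};

// Mirrors the isSupportedOnAVX512 lambda: 32/64-bit elements and pointers are
// always fine for the AVX-512 formula; 8/16-bit (and half) also require BWI.
static bool SupportedOnAVX512(EltKind Elt, const Subtarget &ST) {
  switch (Elt) {
  case EltKind::F32:
  case EltKind::F64:
  case EltKind::I32:
  case EltKind::I64:
  case EltKind::Ptr:
    return true;
  case EltKind::I8:
  case EltKind::I16:
    return ST.HasBWI;
  case EltKind::F16:
    return !ST.SoftFloat && ST.HasFP16 && ST.HasBWI;
  }
  return false;
}

enum class CostPath { AVX512Formula = 0, AVX2Table = 1, BaseFallback = 2 };

// Dispatch order after the merge: masked/gappy groups fall back immediately,
// then the AVX-512 formula (when the element type qualifies), then the
// SSE/AVX2 lookup tables, and finally the generic base-class estimate.
// A table miss in the real code also ends up at the base fallback.
static CostPath PickPath(EltKind Elt, bool UseMaskForCond, bool UseMaskForGaps,
                         const Subtarget &ST) {
  if (UseMaskForCond || UseMaskForGaps)
    return CostPath::BaseFallback;
  if (ST.HasAVX512 && SupportedOnAVX512(Elt, ST))
    return CostPath::AVX512Formula;
  if (ST.HasAVX2)
    return CostPath::AVX2Table;
  return CostPath::BaseFallback;
}

int main() {
  Subtarget Avx512Bw;  // e.g. an AVX-512 machine with BWI
  Avx512Bw.HasAVX512 = Avx512Bw.HasBWI = Avx512Bw.HasAVX2 = true;
  Subtarget Avx2Only;  // e.g. an AVX2-only machine
  Avx2Only.HasAVX2 = true;

  // The same i8 interleave group takes three different cost paths.
  std::printf("%d\n", static_cast<int>(PickPath(EltKind::I8, false, false, Avx512Bw)));
  std::printf("%d\n", static_cast<int>(PickPath(EltKind::I8, false, false, Avx2Only)));
  std::printf("%d\n", static_cast<int>(PickPath(EltKind::I8, true, false, Avx2Only)));
  return 0;
}

Running the sketch prints 0, 1, 2: the i8 group is costed by the AVX-512 formula on a BWI-capable subtarget, by the AVX2 lookup table on an AVX2-only one, and by the base-class fallback once masking is requested.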