diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5205,6 +5205,7 @@ Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); // Get the cost of all the memory operations. + // FIXME: discount dead loads. InstructionCost MemOpCosts = getMemoryOpCost( Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); @@ -5424,22 +5425,27 @@ }; if (Opcode == Instruction::Load) { - // FIXME: if we have a partially-interleaved groups, with gaps, - // should we discount the not-demanded indicies? + auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), + MemOpCosts](const CostTblEntry *Entry) { + // NOTE: this is just an approximation! + // It can over/under -estimate the cost! + return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor); + }; + if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) - return MemOpCosts + Entry->Cost; + return GetDiscountedCost(Entry); if (ST->hasSSSE3()) if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor, ETy.getSimpleVT())) - return MemOpCosts + Entry->Cost; + return GetDiscountedCost(Entry); if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) - return MemOpCosts + Entry->Cost; + return GetDiscountedCost(Entry); } else { assert(Opcode == Instruction::Store && "Expected Store Instruction at this point"); diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll @@ -13,24 +13,24 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; SSE2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 ; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 ; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll @@ -26,11 +26,11 @@ ; AVX1: LV: Found an estimated cost of 188 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll @@ -26,11 +26,11 @@ ; AVX1: LV: Found an estimated cost of 100 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 11 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 23 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll @@ -26,11 +26,11 @@ ; AVX1: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 67 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll @@ -26,11 +26,11 @@ ; AVX1: LV: Found an estimated cost of 192 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll @@ -26,11 +26,11 @@ ; AVX1: LV: Found an estimated cost of 104 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 -; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 +; AVX2: LV: Found an estimated cost of 33 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4