diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1246,21 +1246,23 @@ // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 // The cost is estimated as extract elements at 0, 2, 4, 6 from the // <8 x i32> vector and insert them into a <4 x i32> vector. + APInt ExtractedElts(NumElts, 0); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); // Extract elements from loaded vector for each sub vector. for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT, - Index + Elm * Factor); + ExtractedElts.setBit(Index + Elm * Factor); } + Cost += thisT()->getScalarizationOverhead( + VT, ExtractedElts, /*Insert=*/false, /*Extract=*/true); - InstructionCost InsSubCost = 0; + APInt InsertedElts(NumSubElts, 0); for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - InsSubCost += - thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm); - - Cost += Indices.size() * InsSubCost; + InsertedElts.setBit(Elm); + Cost += Indices.size() * + thisT()->getScalarizationOverhead( + SubVT, InsertedElts, /*Insert=*/true, /*Extract=*/false); } else { // The interleave cost is extract elements from sub vectors, and // insert them into the wide vector. @@ -1275,20 +1277,23 @@ // The cost is estimated as extract all elements (of actual members, // excluding gaps) from both <4 x i32> vectors and insert into the <12 x // i32> vector. 
- InstructionCost ExtSubCost = 0; + APInt ExtractedElts(NumSubElts, 0); for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement, - SubVT, Elm); - Cost += ExtSubCost * Indices.size(); + ExtractedElts.setBit(Elm); + Cost += Indices.size() * + thisT()->getScalarizationOverhead( + SubVT, ExtractedElts, /*Insert=*/false, /*Extract=*/true); + APInt InsertedElts(NumElts, 0); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); - // Insert elements from loaded vector for each sub vector. + // Insert elements from each sub vector into the wide vector. for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT, - Index + Elm * Factor); + InsertedElts.setBit(Index + Elm * Factor); } + Cost += thisT()->getScalarizationOverhead( + VT, InsertedElts, /*Insert=*/true, /*Extract=*/false); } if (!UseMaskForCond) diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll @@ -27,7 +27,7 @@ ; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX2: LV: Found an estimated cost of 285 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, 
i16* %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll @@ -13,17 +13,17 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an estimated cost of 160 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 81 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1 -; AVX1: LV: Found an estimated cost of 226 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1 +; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 
1 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll @@ -13,18 +13,18 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 27 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 79 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 238 for VF 8 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction: store i8 %v1, i8* %out1, align 1 -; AVX1: LV: Found an estimated cost of 97 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 -; AVX1: LV: 
Found an estimated cost of 226 for VF 32 For instruction: store i8 %v1, i8* %out1, align 1 - +; AVX1: LV: Found an estimated cost of 67 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 +; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction: store i8 %v1, i8* %out1, align 1 +; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1