diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1239,6 +1239,9 @@ assert(Indices.size() <= Factor && "Interleaved memory op has too many members"); + const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts); + const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts); + APInt DemandedLoadStoreElts = APInt::getZero(NumElts); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); @@ -1256,7 +1259,8 @@ // The cost is estimated as extract elements at 0, 2, 4, 6 from the // <8 x i32> vector and insert them into a <4 x i32> vector. InstructionCost InsSubCost = - getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false); + thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, + /*Insert*/ true, /*Extract*/ false); Cost += Indices.size() * InsSubCost; Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, @@ -1276,7 +1280,8 @@ // excluding gaps) from both <4 x i32> vectors and insert into the <12 x // i32> vector. InstructionCost ExtSubCost = - getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); + thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, + /*Insert*/ false, /*Extract*/ true); Cost += ExtSubCost * Indices.size(); Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, /*Insert*/ true, @@ -1300,9 +1305,12 @@ // The cost is estimated as extract all mask elements from the <8xi1> mask // vector and insert them factor times into the <24xi1> shuffled mask // vector. - Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); Cost += - getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false); + thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, + /*Insert*/ false, /*Extract*/ true); + Cost += thisT()->getScalarizationOverhead( + MaskVT, UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, + /*Insert*/ true, /*Extract*/ false); // The Gaps mask is invariant and created outside the loop, therefore the // cost of creating it is not accounted for here. However if we have both diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll @@ -107,16 +107,16 @@ ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 41 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 83 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 181 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 152 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) { entry: