diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1255,8 +1255,9 @@ // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 // The cost is estimated as extract elements at 0, 2, 4, 6 from the // <8 x i32> vector and insert them into a <4 x i32> vector. - InstructionCost InsSubCost = - getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false); + InstructionCost InsSubCost = thisT()->getScalarizationOverhead( + SubVT, APInt::getAllOnes(NumSubElts), + /*Insert*/ true, /*Extract*/ false); Cost += Indices.size() * InsSubCost; Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, @@ -1275,8 +1276,9 @@ // The cost is estimated as extract all elements (of actual members, // excluding gaps) from both <4 x i32> vectors and insert into the <12 x // i32> vector. - InstructionCost ExtSubCost = - getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); + InstructionCost ExtSubCost = thisT()->getScalarizationOverhead( + SubVT, APInt::getAllOnes(NumSubElts), + /*Insert*/ false, /*Extract*/ true); Cost += ExtSubCost * Indices.size(); Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, /*Insert*/ true, @@ -1300,9 +1302,13 @@ // The cost is estimated as extract all mask elements from the <8xi1> mask // vector and insert them factor times into the <24xi1> shuffled mask // vector. - Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); Cost += - getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false); + thisT()->getScalarizationOverhead(SubVT, APInt::getAllOnes(NumSubElts), + /*Insert*/ false, /*Extract*/ true); + Cost += thisT()->getScalarizationOverhead( + MaskVT, + UseMaskForGaps ? DemandedLoadStoreElts : APInt::getAllOnes(NumElts), + /*Insert*/ true, /*Extract*/ false); // The Gaps mask is invariant and created outside the loop, therefore the // cost of creating it is not accounted for here. However if we have both