diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5275,7 +5275,8 @@ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), LegalVT.getVectorNumElements()); InstructionCost MemOpCost; - if (UseMaskForCond || UseMaskForGaps) + bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; + if (UseMaskedMemOp) MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); else @@ -5286,7 +5287,7 @@ MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); InstructionCost MaskCost; - if (UseMaskForCond || UseMaskForGaps) { + if (UseMaskedMemOp) { APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); @@ -5349,9 +5350,10 @@ NumOfLoadsInInterleaveGrp; // About a half of the loads may be folded in shuffles when we have only - // one result. If we have more than one result, we do not fold loads at all. + // one result. If we have more than one result, or the loads are masked, + // we do not fold loads at all. unsigned NumOfUnfoldedLoads = - NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; + UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; // Get a number of shuffle operations per result. unsigned NumOfShufflesPerResult = diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll --- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll @@ -172,10 +172,10 @@ ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test" ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 16 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 7 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 16 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 define void @test(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readnone %y) { entry: