Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14957,14 +14957,18 @@ // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) return; - // The memory operands must not be volatile. + // The memory operands must not be volatile/indexed. if (Ld->isVolatile() || Ld->isIndexed()) return; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { + // The memory operands must not be volatile/indexed. if (Other->isVolatile() || Other->isIndexed()) return false; + // Don't mix temporal stores with non-temporal stores. + if (St->isNonTemporal() != Other->isNonTemporal()) + return false; SDValue Val = peekThroughBitcasts(Other->getValue()); // Allow merging constants of different types as integers. bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) @@ -15139,6 +15143,7 @@ isa(StoredVal); bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); + bool IsNonTemporalStore = St->isNonTemporal(); if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) return false; @@ -15582,26 +15587,30 @@ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); - MachineMemOperand::Flags MMOFlags = + MachineMemOperand::Flags LdMMOFlags = isDereferenceable ? MachineMemOperand::MODereferenceable : MachineMemOperand::MONone; + MachineMemOperand::Flags StMMOFlags = + IsNonTemporalStore ? 
MachineMemOperand::MONonTemporal + : MachineMemOperand::MONone; + SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), - FirstLoadAlign, MMOFlags); + FirstLoadAlign, LdMMOFlags); NewStore = DAG.getStore( NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), FirstStoreAlign); + FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); } else { // This must be the truncstore/extload case EVT ExtendedTy = TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), JointMemOpVT, - FirstLoadAlign, MMOFlags); + FirstLoadAlign, LdMMOFlags); NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), Index: test/CodeGen/X86/merge-consecutive-stores-nt.ll =================================================================== --- test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -8,8 +8,7 @@ ; PR42123 ; -; FIXME: AVX doesn't retain NT flag on store. -; Should be VMOVNTPS ymm. +; AVX should be VMOVNTPS ymm. define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32: ; X86: # %bb.0: @@ -32,7 +31,7 @@ ; X64-AVX-LABEL: merge_2_v4f32_align32: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) +; X64-AVX-NEXT: vmovntps %ymm0, (%rsi) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 @@ -46,8 +45,7 @@ ret void } -; FIXME: shouldn't attempt to merge nt and non-nt stores even if aligned. -; Must be kept seperate as VMOVNTPS xmm + VMOVAPS xmm. +; AVX must be kept separate as VMOVNTPS xmm + VMOVAPS xmm. 
define void @merge_2_v4f32_align32_mix(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32_mix: ; X86: # %bb.0: @@ -69,9 +67,10 @@ ; ; X64-AVX-LABEL: merge_2_v4f32_align32_mix: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>*