Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14957,7 +14957,7 @@ // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) return; - // The memory operands must not be volatile. + // The memory operands must not be volatile/indexed. if (Ld->isVolatile() || Ld->isIndexed()) return; } @@ -14965,6 +14965,9 @@ int64_t &Offset) -> bool { if (Other->isVolatile() || Other->isIndexed()) return false; + // Don't mix temporal stores with non-temporal stores. + if (St->isNonTemporal() != Other->isNonTemporal()) + return false; SDValue OtherBC = peekThroughBitcasts(Other->getValue()); // Allow merging constants of different types as integers. bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) @@ -14974,15 +14977,18 @@ return false; // The Load's Base Ptr must also match if (LoadSDNode *OtherLd = dyn_cast(OtherBC)) { - auto LPtr = BaseIndexOffset::match(OtherLd, DAG); + BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG); if (LoadVT != OtherLd->getMemoryVT()) return false; // Loads must only have one use. if (!OtherLd->hasNUsesOfValue(1, 0)) return false; - // The memory operands must not be volatile. + // The memory operands must not be volatile/indexed. if (OtherLd->isVolatile() || OtherLd->isIndexed()) return false; + // Don't mix temporal loads with non-temporal loads. + if (cast(Val)->isNonTemporal() != OtherLd->isNonTemporal()) + return false; if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) return false; } else @@ -15139,6 +15145,9 @@ isa(StoredVal); bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); + bool IsNonTemporalStore = St->isNonTemporal(); + bool IsNonTemporalLoad = + IsLoadSrc && cast(StoredVal)->isNonTemporal(); if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) return false; @@ -15582,26 +15591,32 @@ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); - MachineMemOperand::Flags MMOFlags = + MachineMemOperand::Flags LdMMOFlags = isDereferenceable ? MachineMemOperand::MODereferenceable : MachineMemOperand::MONone; + if (IsNonTemporalLoad) + LdMMOFlags |= MachineMemOperand::MONonTemporal; + + MachineMemOperand::Flags StMMOFlags = + IsNonTemporalStore ? MachineMemOperand::MONonTemporal + : MachineMemOperand::MONone; SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), - FirstLoadAlign, MMOFlags); + FirstLoadAlign, LdMMOFlags); NewStore = DAG.getStore( NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), FirstStoreAlign); + FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); } else { // This must be the truncstore/extload case EVT ExtendedTy = TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), JointMemOpVT, - FirstLoadAlign, MMOFlags); + FirstLoadAlign, LdMMOFlags); NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), Index: test/CodeGen/X86/merge-consecutive-stores-nt.ll =================================================================== --- test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -10,10 +10,6 @@ ; PR42123 ; -; FIXME: AVX doesn't retain NT flag on load/store. -; AVX1 load should be 2 x VMOVNTDQA xmm. -; AVX2 load should be VMOVNTDQA ymm. -; AVX store should be VMOVNTPS ymm. define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32: ; X86: # %bb.0: @@ -49,12 +45,20 @@ ; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi) ; X64-SSE41-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi) +; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovntdq %ymm0, (%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0 @@ -66,8 +70,7 @@ ret void } -; FIXME: shouldn't attempt to merge nt and non-nt loads even if aligned. -; Must be kept seperate as VMOVNTDQA xmm + VMOVDQA xmm. +; Don't merge nt and non-nt loads even if aligned. define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32_mix_ntload: ; X86: # %bb.0: @@ -105,9 +108,10 @@ ; ; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -120,8 +124,7 @@ ret void } -; FIXME: shouldn't attempt to merge nt and non-nt stores even if aligned. -; Must be kept seperate as VMOVNTPS xmm + VMOVAPS xmm. +; Don't merge nt and non-nt stores even if aligned. define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32_mix_ntstore: ; X86: # %bb.0: @@ -143,9 +146,10 @@ ; ; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -158,7 +162,7 @@ ret void } -; FIXME: AVX can't perform NT-load-ymm on 16-byte aligned memory. +; FIXME: AVX2 can't perform NT-load-ymm on 16-byte aligned memory. ; Must be kept seperate as VMOVNTDQA xmm. define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align16_ntload: @@ -195,12 +199,20 @@ ; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi) ; X64-SSE41-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align16_ntload: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align16_ntload: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovdqa %xmm1, 16(%rsi) +; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align16_ntload: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovups %ymm0, (%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0