diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1983,6 +1983,13 @@
     return false;
   }
 
+  /// Whether AtomicExpandPass should automatically insert a trailing fence
+  /// without reducing the ordering for this atomic. Defaults to false.
+  virtual bool
+  shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const {
+    return false;
+  }
+
   /// Perform a load-linked operation on Addr, returning a "Value *" with the
   /// corresponding pointee type. This may entail some non-trivial operations to
   /// truncate or reconstruct types that will be illegal in the backend. See
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -289,6 +289,24 @@
       if (FenceOrdering != AtomicOrdering::Monotonic) {
         MadeChange |= bracketInstWithFences(I, FenceOrdering);
       }
+    } else if (I->hasAtomicStore() &&
+               TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
+      auto FenceOrdering = AtomicOrdering::Monotonic;
+      if (SI)
+        FenceOrdering = SI->getOrdering();
+      else if (RMWI)
+        FenceOrdering = RMWI->getOrdering();
+      else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
+                           TargetLoweringBase::AtomicExpansionKind::LLSC)
+        // LLSC is handled in expandAtomicCmpXchg().
+        FenceOrdering = CASI->getSuccessOrdering();
+
+      IRBuilder<> Builder(I);
+      if (auto TrailingFence =
+              TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
+        TrailingFence->moveAfter(I);
+        MadeChange = true;
+      }
     }
 
     if (LI)
@@ -1356,7 +1374,8 @@
   // Make sure later instructions don't get reordered with a fence if
   // necessary.
   Builder.SetInsertPoint(SuccessBB);
-  if (ShouldInsertFencesForAtomic)
+  if (ShouldInsertFencesForAtomic ||
+      TLI->shouldInsertTrailingFenceForAtomicStore(CI))
     TLI->emitTrailingFence(Builder, CI, SuccessOrder);
   Builder.CreateBr(ExitBB);
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -706,6 +706,8 @@
 
   bool isOpSuitableForLDPSTP(const Instruction *I) const;
   bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+  bool
+  shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override;
 
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22300,6 +22300,30 @@
   return isOpSuitableForLDPSTP(I);
 }
 
+bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
+    const Instruction *I) const {
+  // Store-Release instructions only provide seq_cst guarantees when paired with
+  // Load-Acquire instructions. MSVC CRT does not use these instructions to
+  // implement seq_cst loads and stores, so we need additional explicit fences
+  // after memory writes.
+  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+    return false;
+
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::AtomicCmpXchg:
+    return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  case Instruction::AtomicRMW:
+    return cast<AtomicRMWInst>(I)->getOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  case Instruction::Store:
+    return cast<StoreInst>(I)->getOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  }
+}
+
 // Loads and stores less than 128-bits are already atomic; ones above that
 // are doomed anyway, so defer to the default libcall and blame the OS when
 // things go wrong.
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
--- a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
@@ -19,6 +19,7 @@
 ; CHECK-NEXT:    cbnz w11, .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw add ptr @var8, i8 %offset seq_cst
   ret i8 %old
@@ -135,16 +136,17 @@
 define dso_local i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_sub_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    adrp x9, var64
 ; CHECK-NEXT:    add x9, x9, :lo12:var64
 ; CHECK-NEXT:  .LBB7_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x8, [x9]
-; CHECK-NEXT:    sub x10, x8, x0
+; CHECK-NEXT:    ldaxr x0, [x9]
+; CHECK-NEXT:    sub x10, x0, x8
 ; CHECK-NEXT:    stlxr w11, x10, [x9]
 ; CHECK-NEXT:    cbnz w11, .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw sub ptr @var64, i64 %offset seq_cst
   ret i64 %old
@@ -199,6 +201,7 @@
 ; CHECK-NEXT:    cbnz w11, .LBB10_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw and ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -235,6 +238,7 @@
 ; CHECK-NEXT:    cbnz w11, .LBB12_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw or ptr @var8, i8 %offset seq_cst
   ret i8 %old
@@ -343,6 +347,7 @@
 ; CHECK-NEXT:    cbnz w11, .LBB18_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw xor ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -397,6 +402,7 @@
 ; CHECK-NEXT:    cbnz w10, .LBB21_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw xchg ptr @var16, i16 %offset seq_cst
   ret i16 %old
@@ -500,17 +506,18 @@
 define dso_local i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_min_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    adrp x9, var64
 ; CHECK-NEXT:    add x9, x9, :lo12:var64
 ; CHECK-NEXT:  .LBB27_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x8, [x9]
-; CHECK-NEXT:    cmp x8, x0
-; CHECK-NEXT:    csel x10, x8, x0, le
+; CHECK-NEXT:    ldaxr x0, [x9]
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    csel x10, x0, x8, le
 ; CHECK-NEXT:    stlxr w11, x10, [x9]
 ; CHECK-NEXT:    cbnz w11, .LBB27_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw min ptr @var64, i64 %offset seq_cst
   ret i64 %old
@@ -531,6 +538,7 @@
 ; CHECK-NEXT:    cbnz w11, .LBB28_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw max ptr @var8, i8 %offset seq_cst
   ret i8 %old
@@ -648,6 +656,7 @@
 ; CHECK-NEXT:    cbnz w11, .LBB34_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw umin ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -726,6 +735,7 @@
 ; CHECK-NEXT:    cbnz w11, .LBB38_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw umax ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -794,7 +804,8 @@
 ; CHECK-NEXT:    // in Loop: Header=BB41_1 Depth=1
 ; CHECK-NEXT:    stlxrh w10, w1, [x9]
 ; CHECK-NEXT:    cbnz w10, .LBB41_1
-; CHECK-NEXT:  // %bb.3: // %cmpxchg.end
+; CHECK-NEXT:  // %bb.3: // %cmpxchg.success
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB41_4: // %cmpxchg.nostore
@@ -972,6 +983,7 @@
 ; CHECK-NEXT:    adrp x8, var8
 ; CHECK-NEXT:    add x8, x8, :lo12:var8
 ; CHECK-NEXT:    stlrb w0, [x8]
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   store atomic i8 %val, ptr @var8 seq_cst, align 1
   ret void
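
For reference, the behavior this patch adds can be exercised with a reduced standalone test. The sketch below is illustrative only and not part of the patch: the @flag/@counter globals, the publish/bump functions, and the RUN line are made up. The expected instruction sequences are taken from the CHECK updates in llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll above: on an MSVC target, a seq_cst store lowers to a store-release followed by an explicit dmb ish, and a seq_cst atomicrmw keeps its LL/SC loop and gains a trailing dmb ish.

; RUN: llc -mtriple=aarch64-pc-windows-msvc -verify-machineinstrs < %s | FileCheck %s

@flag = dso_local global i8 0
@counter = dso_local global i32 0

; seq_cst store: stlrb alone is not paired with load-acquire by the MSVC CRT,
; so the new hook makes AtomicExpandPass emit a trailing fence (dmb ish).
define dso_local void @publish(i8 %val) nounwind {
; CHECK-LABEL: publish:
; CHECK:         stlrb
; CHECK-NEXT:    dmb ish
; CHECK-NEXT:    ret
  store atomic i8 %val, ptr @flag seq_cst, align 1
  ret void
}

; seq_cst read-modify-write: the LL/SC loop is kept and a fence follows it.
define dso_local i32 @bump(i32 %v) nounwind {
; CHECK-LABEL: bump:
; CHECK:         stlxr
; CHECK:         dmb ish
; CHECK:         ret
  %old = atomicrmw add ptr @counter, i32 %v seq_cst
  ret i32 %old
}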