Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16649,25 +16649,36 @@
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
   if (Size > 128) return AtomicExpansionKind::None;
-  // Nand not supported in LSE.
-  if (AI->getOperation() == AtomicRMWInst::Nand)
-    return AtomicExpansionKind::LLSC;
-  // Leave 128 bits to LLSC.
-  if (Subtarget->hasLSE() && Size < 128)
-    return AtomicExpansionKind::None;
-  if (Subtarget->outlineAtomics() && Size < 128) {
-    // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
-    // Don't outline them unless
-    // (1) high level support approved:
-    //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
-    // (2) low level libgcc and compiler-rt support implemented by:
-    //   min/max outline atomics helpers
-    if (AI->getOperation() != AtomicRMWInst::Min &&
-        AI->getOperation() != AtomicRMWInst::Max &&
-        AI->getOperation() != AtomicRMWInst::UMin &&
-        AI->getOperation() != AtomicRMWInst::UMax) {
+
+  // Nand is not supported in LSE.
+  // Leave 128 bits to LLSC or CmpXChg.
+  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
+    if (Subtarget->hasLSE())
       return AtomicExpansionKind::None;
+    if (Subtarget->outlineAtomics()) {
+      // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
+      // Don't outline them unless
+      // (1) high level support approved:
+      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
+      // (2) low level libgcc and compiler-rt support implemented by:
+      //   min/max outline atomics helpers
+      if (AI->getOperation() != AtomicRMWInst::Min &&
+          AI->getOperation() != AtomicRMWInst::Max &&
+          AI->getOperation() != AtomicRMWInst::UMin &&
+          AI->getOperation() != AtomicRMWInst::UMax) {
+        return AtomicExpansionKind::None;
+      }
     }
   }
+
+  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+  // implement cmpxchg without spilling. If the address being exchanged is also
+  // on the stack and close enough to the spill slot, this can lead to a
+  // situation where the monitor always gets cleared and the atomic operation
+  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+    return AtomicExpansionKind::CmpXChg;
+
   return AtomicExpansionKind::LLSC;
 }
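For context: with this change, an integer atomicrmw compiled at -O0 is rewritten by AtomicExpandPass into a cmpxchg loop (AtomicExpansionKind::CmpXChg), and the cmpxchg itself is selected to a pseudo-instruction that is only expanded after register allocation, so fast-regalloc can no longer insert a spill or reload between the exclusive load and store. A minimal sketch of the loop shape the pass emits, with an illustrative function name and i32 width that are not taken from this patch:

define i32 @rmw_add_via_cmpxchg(i32* %ptr, i32 %val) {
entry:
  ; Read the current value once before entering the loop.
  %init = load i32, i32* %ptr, align 4
  br label %loop

loop:
  ; Last value observed at %ptr (from the entry load or a failed cmpxchg).
  %loaded = phi i32 [ %init, %entry ], [ %newloaded, %loop ]
  ; Apply the RMW operation; for atomicrmw add this is a plain add.
  %new = add i32 %loaded, %val
  ; Try to publish the result; fails if another thread wrote %ptr meanwhile.
  %pair = cmpxchg i32* %ptr, i32 %loaded, i32 %new seq_cst seq_cst
  %newloaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %done, label %loop

done:
  ret i32 %newloaded
}

The ldaxr/stlxr (or ldaxp/stlxp for i128) pair only appears later, inside the post-RA expansion of the cmpxchg pseudo, which is why the test below can insist that no spill code lands between the exclusive pair.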
Index: llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -O0 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s
+
+; Ensure there are no stack spills between ldxr/stxr pairs.
+
+define i8 @test_rmw_add_8(i8* %dst) optnone noinline {
+; CHECK-LABEL: test_rmw_add_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b .LBB0_1
+; CHECK-NEXT: .LBB0_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB0_2 Depth 2
+; CHECK-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add w12, w9, #1 // =1
+; CHECK-NEXT: .LBB0_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB0_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxrb w8, [x11]
+; CHECK-NEXT: cmp w8, w9, uxtb
+; CHECK-NEXT: b.ne .LBB0_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=2
+; CHECK-NEXT: stlxrb w10, w12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB0_2
+; CHECK-NEXT: .LBB0_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: subs w9, w8, w9, uxtb
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str w8, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: b .LBB0_5
+; CHECK-NEXT: .LBB0_5: // %atomicrmw.end
+; CHECK-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+entry:
+  %res = atomicrmw add i8* %dst, i8 1 seq_cst
+  ret i8 %res
+}
+
+define i16 @test_rmw_add_16(i16* %dst) optnone noinline {
+; CHECK-LABEL: test_rmw_add_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b .LBB1_1
+; CHECK-NEXT: .LBB1_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB1_2 Depth 2
+; CHECK-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add w12, w9, #1 // =1
+; CHECK-NEXT: .LBB1_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB1_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxrh w8, [x11]
+; CHECK-NEXT: cmp w8, w9, uxth
+; CHECK-NEXT: b.ne .LBB1_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=2
+; CHECK-NEXT: stlxrh w10, w12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB1_2
+; CHECK-NEXT: .LBB1_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: subs w9, w8, w9, uxth
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str w8, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB1_1
+; CHECK-NEXT: b .LBB1_5
+; CHECK-NEXT: .LBB1_5: // %atomicrmw.end
+; CHECK-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+entry:
+  %res = atomicrmw add i16* %dst, i16 1 seq_cst
+  ret i16 %res
+}
+
+define i32 @test_rmw_add_32(i32* %dst) optnone noinline {
+; CHECK-LABEL: test_rmw_add_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b .LBB2_1
+; CHECK-NEXT: .LBB2_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB2_2 Depth 2
+; CHECK-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add w12, w9, #1 // =1
+; CHECK-NEXT: .LBB2_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB2_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxr w8, [x11]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: b.ne .LBB2_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB2_2 Depth=2
+; CHECK-NEXT: stlxr w10, w12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB2_2
+; CHECK-NEXT: .LBB2_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT: subs w9, w8, w9
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str w8, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB2_1
+; CHECK-NEXT: b .LBB2_5
+; CHECK-NEXT: .LBB2_5: // %atomicrmw.end
+; CHECK-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+entry:
+  %res = atomicrmw add i32* %dst, i32 1 seq_cst
+  ret i32 %res
+}
+
+define i64 @test_rmw_add_64(i64* %dst) optnone noinline {
+; CHECK-LABEL: test_rmw_add_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB3_1
+; CHECK-NEXT: .LBB3_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB3_2 Depth 2
+; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add x12, x9, #1 // =1
+; CHECK-NEXT: .LBB3_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxr x8, [x11]
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.ne .LBB3_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB3_2 Depth=2
+; CHECK-NEXT: stlxr w10, x12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB3_2
+; CHECK-NEXT: .LBB3_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT: subs x9, x8, x9
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB3_1
+; CHECK-NEXT: b .LBB3_5
+; CHECK-NEXT: .LBB3_5: // %atomicrmw.end
+; CHECK-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+entry:
+  %res = atomicrmw add i64* %dst, i64 1 seq_cst
+  ret i64 %res
+}
+
+define i128 @test_rmw_add_128(i128* %dst) optnone noinline {
+; CHECK-LABEL: test_rmw_add_128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48 // =48
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: str x0, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: ldr x8, [x0, #8]
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB4_1
+; CHECK-NEXT: .LBB4_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB4_2 Depth 2
+; CHECK-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: adds x14, x8, #1 // =1
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: adcs x15, x11, x9
+; CHECK-NEXT: .LBB4_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB4_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxp x10, x9, [x13]
+; CHECK-NEXT: cmp x10, x8
+; CHECK-NEXT: cset w12, ne
+; CHECK-NEXT: cmp x9, x11
+; CHECK-NEXT: cinc w12, w12, ne
+; CHECK-NEXT: cbnz w12, .LBB4_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=2
+; CHECK-NEXT: stlxp w12, x14, x15, [x13]
+; CHECK-NEXT: cbnz w12, .LBB4_2
+; CHECK-NEXT: .LBB4_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT: eor x11, x9, x11
+; CHECK-NEXT: eor x8, x10, x8
+; CHECK-NEXT: orr x8, x8, x11
+; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
+; CHECK-NEXT: cbnz x8, .LBB4_1
+; CHECK-NEXT: b .LBB4_5
+; CHECK-NEXT: .LBB4_5: // %atomicrmw.end
+; CHECK-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48 // =48
+; CHECK-NEXT: ret
+entry:
+  %res = atomicrmw add i128* %dst, i128 1 seq_cst
+  ret i128 %res
+}
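Note that the restructured condition above deliberately keeps Nand out of the fast paths: it has no LSE instruction and no outline-atomics helper, so it always takes the LLSC route, or the cmpxchg route at -O0. An illustrative IR function, not part of the checked-in test, that would exercise that path under the same RUN line:

define i32 @test_rmw_nand_32(i32* %dst) optnone noinline {
entry:
  %res = atomicrmw nand i32* %dst, i32 1 seq_cst
  ret i32 %res
}

At -O0 this should produce the same shape as the add cases: an outer cmpxchg loop whose spills land only outside the inner ldaxr/stlxr pair.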