diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16665,10 +16665,10 @@
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
   if (Size > 128) return AtomicExpansionKind::None;
 
-  // Nand not supported in LSE.
-  if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
-  // Leave 128 bits to LLSC.
-  if (Subtarget->hasLSE() && Size < 128)
+  // If we have LSE, don't expand non-NAND operations smaller than 128 bits.
+  // We want to use the native instructions.
+  if (Subtarget->hasLSE() && AI->getOperation() != AtomicRMWInst::Nand &&
+      Size < 128)
     return AtomicExpansionKind::None;
   if (Subtarget->outlineAtomics() && Size < 128) {
     // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
@@ -16680,10 +16680,19 @@
     if (AI->getOperation() != AtomicRMWInst::Min &&
         AI->getOperation() != AtomicRMWInst::Max &&
         AI->getOperation() != AtomicRMWInst::UMin &&
-        AI->getOperation() != AtomicRMWInst::UMax) {
+        AI->getOperation() != AtomicRMWInst::UMax &&
+        AI->getOperation() != AtomicRMWInst::Nand) {
       return AtomicExpansionKind::None;
     }
   }
 
+  // At -O0, use late-expanded pseudo-instruction. See comment in
+  // shouldExpandAtomicCmpXchgInIR.
+  //
+  // FIXME: Expanding to cmpxchg generates a really long nested loop;
+  // we could save a bunch of instructions by adding dedicated
+  // pseudo-instructions for each atomicrmw operation.
+  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+    return AtomicExpansionKind::CmpXChg;
   return AtomicExpansionKind::LLSC;
 }
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm64-eabi -verify-machineinstrs -O0 | FileCheck -enable-var-scope %s
+; RUN: llc < %s -mtriple=arm64-eabi -verify-machineinstrs -O0 -mattr=+lse | FileCheck -enable-var-scope -check-prefix=LSE %s
+
+; We need to ensure there aren't any load/store instructions between
+; ldaxr and the stlxr. This has been an issue with fast regalloc.
+
+define i32 @fetch_and_nand(i32* %p) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b .LBB0_1
+; CHECK-NEXT: .LBB0_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB0_2 Depth 2
+; CHECK-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mvn w8, w9
+; CHECK-NEXT: orr w12, w8, #0xfffffff8
+; CHECK-NEXT: .LBB0_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB0_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxr w8, [x11]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: b.ne .LBB0_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=2
+; CHECK-NEXT: stlxr w10, w12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB0_2
+; CHECK-NEXT: .LBB0_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: subs w9, w8, w9
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str w8, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: b .LBB0_5
+; CHECK-NEXT: .LBB0_5: // %atomicrmw.end
+; CHECK-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+;
+; LSE-LABEL: fetch_and_nand:
+; LSE: // %bb.0:
+; LSE-NEXT: sub sp, sp, #32 // =32
+; LSE-NEXT: .cfi_def_cfa_offset 32
+; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT: ldr w8, [x0]
+; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT: .LBB0_1: // %atomicrmw.start
+; LSE-NEXT: // =>This Inner Loop Header: Depth=1
+; LSE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: and w9, w8, #0x7
+; LSE-NEXT: mvn w10, w9
+; LSE-NEXT: mov w9, w8
+; LSE-NEXT: casl w9, w10, [x11]
+; LSE-NEXT: str w9, [sp, #12] // 4-byte Folded Spill
+; LSE-NEXT: subs w8, w9, w8
+; LSE-NEXT: cset w8, eq
+; LSE-NEXT: str w9, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT: tbz w8, #0, .LBB0_1
+; LSE-NEXT: // %bb.2: // %atomicrmw.end
+; LSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
+; LSE-NEXT: add sp, sp, #32 // =32
+; LSE-NEXT: ret
+  %val = atomicrmw nand i32* %p, i32 7 release
+  ret i32 %val
+}
+
+define i64 @fetch_and_nand_64(i64* %p) {
+; CHECK-LABEL: fetch_and_nand_64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB1_1
+; CHECK-NEXT: .LBB1_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB1_2 Depth 2
+; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov w8, w9
+; CHECK-NEXT: mvn w10, w8
+; CHECK-NEXT: // implicit-def: $x8
+; CHECK-NEXT: mov w8, w10
+; CHECK-NEXT: orr x12, x8, #0xfffffffffffffff8
+; CHECK-NEXT: .LBB1_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB1_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxr x8, [x11]
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.ne .LBB1_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=2
+; CHECK-NEXT: stlxr w10, x12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB1_2
+; CHECK-NEXT: .LBB1_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: subs x9, x8, x9
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB1_1
+; CHECK-NEXT: b .LBB1_5
+; CHECK-NEXT: .LBB1_5: // %atomicrmw.end
+; CHECK-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+;
+; LSE-LABEL: fetch_and_nand_64:
+; LSE: // %bb.0:
+; LSE-NEXT: sub sp, sp, #32 // =32
+; LSE-NEXT: .cfi_def_cfa_offset 32
+; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT: ldr x8, [x0]
+; LSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
+; LSE-NEXT: .LBB1_1: // %atomicrmw.start
+; LSE-NEXT: // =>This Inner Loop Header: Depth=1
+; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: and x9, x8, #0x7
+; LSE-NEXT: mvn x10, x9
+; LSE-NEXT: mov x9, x8
+; LSE-NEXT: casal x9, x10, [x11]
+; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; LSE-NEXT: subs x8, x9, x8
+; LSE-NEXT: cset w8, eq
+; LSE-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; LSE-NEXT: tbz w8, #0, .LBB1_1
+; LSE-NEXT: // %bb.2: // %atomicrmw.end
+; LSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
+; LSE-NEXT: add sp, sp, #32 // =32
+; LSE-NEXT: ret
+  %val = atomicrmw nand i64* %p, i64 7 acq_rel
+  ret i64 %val
+}
+
+define i32 @fetch_and_or(i32* %p) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b .LBB2_1
+; CHECK-NEXT: .LBB2_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB2_2 Depth 2
+; CHECK-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: orr w12, w9, w8
+; CHECK-NEXT: .LBB2_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB2_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxr w8, [x11]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: b.ne .LBB2_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB2_2 Depth=2
+; CHECK-NEXT: stlxr w10, w12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB2_2
+; CHECK-NEXT: .LBB2_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT: subs w9, w8, w9
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str w8, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB2_1
+; CHECK-NEXT: b .LBB2_5
+; CHECK-NEXT: .LBB2_5: // %atomicrmw.end
+; CHECK-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+;
+; LSE-LABEL: fetch_and_or:
+; LSE: // %bb.0:
+; LSE-NEXT: mov w8, #5
+; LSE-NEXT: ldsetal w8, w0, [x0]
+; LSE-NEXT: ret
+  %val = atomicrmw or i32* %p, i32 5 seq_cst
+  ret i32 %val
+}
+
+define i64 @fetch_and_or_64(i64* %p) {
+; CHECK-LABEL: fetch_and_or_64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #32 // =32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB3_1
+; CHECK-NEXT: .LBB3_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB3_2 Depth 2
+; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: orr x12, x9, #0x7
+; CHECK-NEXT: .LBB3_2: // %atomicrmw.start
+; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldaxr x8, [x11]
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.ne .LBB3_4
+; CHECK-NEXT: // %bb.3: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB3_2 Depth=2
+; CHECK-NEXT: stlxr w10, x12, [x11]
+; CHECK-NEXT: cbnz w10, .LBB3_2
+; CHECK-NEXT: .LBB3_4: // %atomicrmw.start
+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT: subs x9, x8, x9
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: subs w9, w9, #1 // =1
+; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: b.ne .LBB3_1
+; CHECK-NEXT: b .LBB3_5
+; CHECK-NEXT: .LBB3_5: // %atomicrmw.end
+; CHECK-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32 // =32
+; CHECK-NEXT: ret
+;
+; LSE-LABEL: fetch_and_or_64:
+; LSE: // %bb.0:
+; LSE-NEXT: mov w8, #7
+; LSE-NEXT: // kill: def $x8 killed $w8
+; LSE-NEXT: ldset x8, x0, [x0]
+; LSE-NEXT: ret
+  %val = atomicrmw or i64* %p, i64 7 monotonic
+  ret i64 %val
+}
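Note for readers of the new test's CHECK lines: with this change, at -O0 the AtomicExpand pass first rewrites the atomicrmw into a cmpxchg loop in IR, and the cmpxchg is then lowered through the late-expanded pseudo-instruction mentioned in the code comment. That is where the nested loop shape checked above comes from: the outer %atomicrmw.start loop is the IR-level expansion, and the inner ldaxr/stlxr loop is the expanded cmpxchg pseudo. Below is a rough, illustrative sketch of the intermediate IR for the nand case; the function and value names are made up for illustration and the exact output of AtomicExpand may differ slightly.

; Illustrative only: approximate IR after AtomicExpand for the nand case at -O0.
; Names (%init, %loaded, %new, ...) are hypothetical.
define i32 @fetch_and_nand_expanded(i32* %p) {
entry:
  %init = load i32, i32* %p
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
  %tmp = and i32 %loaded, 7
  %new = xor i32 %tmp, -1    ; nand: ~(%loaded & 7)
  ; The cmpxchg below is what the backend lowers via the late-expanded pseudo,
  ; producing the inner ldaxr/stlxr loop (or a single casl with +lse).
  %pair = cmpxchg i32* %p, i32 %loaded, i32 %new release monotonic
  %newloaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret i32 %newloaded
}

The FIXME in the patch refers to this shape: the compare-and-branch around the cmpxchg forms the outer loop, and the pseudo-expanded cmpxchg supplies the inner one, which is why a dedicated atomicrmw pseudo would produce shorter code.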