diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16669,25 +16669,36 @@ unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size > 128) return AtomicExpansionKind::None; - // Nand not supported in LSE. - if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; - // Leave 128 bits to LLSC. - if (Subtarget->hasLSE() && Size < 128) - return AtomicExpansionKind::None; - if (Subtarget->outlineAtomics() && Size < 128) { - // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far. - // Don't outline them unless - // (1) high level support approved: - // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf - // (2) low level libgcc and compiler-rt support implemented by: - // min/max outline atomics helpers - if (AI->getOperation() != AtomicRMWInst::Min && - AI->getOperation() != AtomicRMWInst::Max && - AI->getOperation() != AtomicRMWInst::UMin && - AI->getOperation() != AtomicRMWInst::UMax) { + + // Nand is not supported in LSE. + // Leave 128 bits to LLSC or CmpXChg. + if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { + if (Subtarget->hasLSE()) return AtomicExpansionKind::None; + if (Subtarget->outlineAtomics()) { + // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far. 
+ // Don't outline them unless + // (1) high level support approved: + // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf + // (2) low level libgcc and compiler-rt support implemented by: + // min/max outline atomics helpers + if (AI->getOperation() != AtomicRMWInst::Min && + AI->getOperation() != AtomicRMWInst::Max && + AI->getOperation() != AtomicRMWInst::UMin && + AI->getOperation() != AtomicRMWInst::UMax) { + return AtomicExpansionKind::None; + } } } + + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement atomicrmw without spilling. If the target address is also on the + // stack and close enough to the spill slot, this can lead to a situation + // where the monitor always gets cleared and the atomic operation can never + // succeed. So at -O0 lower this operation to a CAS loop. + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + return AtomicExpansionKind::CmpXChg; + return AtomicExpansionKind::LLSC; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -225,22 +225,37 @@ ; ; CHECK-NOLSE-O0-LABEL: fetch_and_nand: ; CHECK-NOLSE-O0: ; %bb.0: -; CHECK-NOLSE-O0-NEXT: sub sp, sp, #16 ; =16 -; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 ; =32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: LBB4_1: ; %atomicrmw.start -; CHECK-NOLSE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NOLSE-O0-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: ldxr w8, [x10] -; CHECK-NOLSE-O0-NEXT: ; kill: def $x8 killed $w8 -; CHECK-NOLSE-O0-NEXT: ; kill: def $w8 killed $w8 killed $x8 -; CHECK-NOLSE-O0-NEXT: str w8, [sp, 
#4] ; 4-byte Folded Spill -; CHECK-NOLSE-O0-NEXT: and w8, w8, #0x7 -; CHECK-NOLSE-O0-NEXT: mvn w9, w8 -; CHECK-NOLSE-O0-NEXT: stlxr w8, w9, [x10] -; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB4_1 -; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #4] ; 4-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: add sp, sp, #16 ; =16 +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB4_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: and w9, w8, #0x7 +; CHECK-NOLSE-O0-NEXT: mvn w12, w9 +; CHECK-NOLSE-O0-NEXT: LBB4_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB4_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB4_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB4_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB4_2 +; CHECK-NOLSE-O0-NEXT: LBB4_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB4_1 +; CHECK-NOLSE-O0-NEXT: ; %bb.5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; =32 ; CHECK-NOLSE-O0-NEXT: ret ; ; CHECK-LSE-O1-LABEL: fetch_and_nand: @@ -258,22 +273,26 @@ ; ; CHECK-LSE-O0-LABEL: fetch_and_nand: ; CHECK-LSE-O0: ; %bb.0: -; CHECK-LSE-O0-NEXT: sub sp, sp, #16 ; =16 -; CHECK-LSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-LSE-O0-NEXT: sub sp, sp, #32 ; =32 +; CHECK-LSE-O0-NEXT: str x0, [sp, #16] ; 8-byte 
Folded Spill +; CHECK-LSE-O0-NEXT: ldr w8, [x0] +; CHECK-LSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill ; CHECK-LSE-O0-NEXT: LBB4_1: ; %atomicrmw.start ; CHECK-LSE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-LSE-O0-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload -; CHECK-LSE-O0-NEXT: ldxr w8, [x10] -; CHECK-LSE-O0-NEXT: ; kill: def $x8 killed $w8 -; CHECK-LSE-O0-NEXT: ; kill: def $w8 killed $w8 killed $x8 -; CHECK-LSE-O0-NEXT: str w8, [sp, #4] ; 4-byte Folded Spill -; CHECK-LSE-O0-NEXT: and w8, w8, #0x7 -; CHECK-LSE-O0-NEXT: mvn w9, w8 -; CHECK-LSE-O0-NEXT: stlxr w8, w9, [x10] -; CHECK-LSE-O0-NEXT: cbnz w8, LBB4_1 +; CHECK-LSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-LSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-LSE-O0-NEXT: and w9, w8, #0x7 +; CHECK-LSE-O0-NEXT: mvn w10, w9 +; CHECK-LSE-O0-NEXT: mov x9, x8 +; CHECK-LSE-O0-NEXT: casl w9, w10, [x11] +; CHECK-LSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-LSE-O0-NEXT: subs w8, w9, w8 +; CHECK-LSE-O0-NEXT: cset w8, eq +; CHECK-LSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-LSE-O0-NEXT: tbz w8, #0, LBB4_1 ; CHECK-LSE-O0-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-LSE-O0-NEXT: ldr w0, [sp, #4] ; 4-byte Folded Reload -; CHECK-LSE-O0-NEXT: add sp, sp, #16 ; =16 +; CHECK-LSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-LSE-O0-NEXT: add sp, sp, #32 ; =32 ; CHECK-LSE-O0-NEXT: ret %val = atomicrmw nand i32* %p, i32 7 release ret i32 %val @@ -295,20 +314,37 @@ ; ; CHECK-NOLSE-O0-LABEL: fetch_and_nand_64: ; CHECK-NOLSE-O0: ; %bb.0: -; CHECK-NOLSE-O0-NEXT: sub sp, sp, #16 ; =16 -; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 ; =32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: LBB5_1: ; %atomicrmw.start -; CHECK-NOLSE-O0-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; CHECK-NOLSE-O0-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: ldaxr x8, [x10] -; CHECK-NOLSE-O0-NEXT: str x8, [sp] ; 8-byte Folded Spill -; CHECK-NOLSE-O0-NEXT: and x8, x8, #0x7 -; CHECK-NOLSE-O0-NEXT: mvn x9, x8 -; CHECK-NOLSE-O0-NEXT: stlxr w8, x9, [x10] -; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB5_1 -; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: add sp, sp, #16 ; =16 +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB5_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: and x9, x8, #0x7 +; CHECK-NOLSE-O0-NEXT: mvn x12, x9 +; CHECK-NOLSE-O0-NEXT: LBB5_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB5_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB5_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB5_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB5_2 +; CHECK-NOLSE-O0-NEXT: LBB5_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB5_1 +; CHECK-NOLSE-O0-NEXT: ; %bb.5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; =32 ; CHECK-NOLSE-O0-NEXT: ret ; ; CHECK-LSE-O1-LABEL: fetch_and_nand_64: @@ -326,20 +362,26 @@ ; ; CHECK-LSE-O0-LABEL: fetch_and_nand_64: ; CHECK-LSE-O0: ; %bb.0: -; CHECK-LSE-O0-NEXT: sub sp, sp, #16 ; =16 -; 
CHECK-LSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-LSE-O0-NEXT: sub sp, sp, #32 ; =32 +; CHECK-LSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-LSE-O0-NEXT: ldr x8, [x0] +; CHECK-LSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill ; CHECK-LSE-O0-NEXT: LBB5_1: ; %atomicrmw.start ; CHECK-LSE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-LSE-O0-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload -; CHECK-LSE-O0-NEXT: ldaxr x8, [x10] -; CHECK-LSE-O0-NEXT: str x8, [sp] ; 8-byte Folded Spill -; CHECK-LSE-O0-NEXT: and x8, x8, #0x7 -; CHECK-LSE-O0-NEXT: mvn x9, x8 -; CHECK-LSE-O0-NEXT: stlxr w8, x9, [x10] -; CHECK-LSE-O0-NEXT: cbnz w8, LBB5_1 +; CHECK-LSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-LSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-LSE-O0-NEXT: and x9, x8, #0x7 +; CHECK-LSE-O0-NEXT: mvn x10, x9 +; CHECK-LSE-O0-NEXT: mov x9, x8 +; CHECK-LSE-O0-NEXT: casal x9, x10, [x11] +; CHECK-LSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill +; CHECK-LSE-O0-NEXT: subs x8, x9, x8 +; CHECK-LSE-O0-NEXT: cset w8, eq +; CHECK-LSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill +; CHECK-LSE-O0-NEXT: tbz w8, #0, LBB5_1 ; CHECK-LSE-O0-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-LSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload -; CHECK-LSE-O0-NEXT: add sp, sp, #16 ; =16 +; CHECK-LSE-O0-NEXT: ldr x0, [sp, #8] ; 8-byte Folded Reload +; CHECK-LSE-O0-NEXT: add sp, sp, #32 ; =32 ; CHECK-LSE-O0-NEXT: ret %val = atomicrmw nand i64* %p, i64 7 acq_rel ret i64 %val @@ -361,22 +403,37 @@ ; ; CHECK-NOLSE-O0-LABEL: fetch_and_or: ; CHECK-NOLSE-O0: ; %bb.0: -; CHECK-NOLSE-O0-NEXT: sub sp, sp, #16 ; =16 -; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 ; =32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr w8, [x0] +; CHECK-NOLSE-O0-NEXT: str w8, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: LBB6_1: ; %atomicrmw.start -; 
CHECK-NOLSE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NOLSE-O0-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: ldaxr w8, [x10] -; CHECK-NOLSE-O0-NEXT: ; kill: def $x8 killed $w8 -; CHECK-NOLSE-O0-NEXT: ; kill: def $w8 killed $w8 killed $x8 -; CHECK-NOLSE-O0-NEXT: str w8, [sp, #4] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB6_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr w8, [sp, #28] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload ; CHECK-NOLSE-O0-NEXT: mov w9, #5 -; CHECK-NOLSE-O0-NEXT: orr w9, w8, w9 -; CHECK-NOLSE-O0-NEXT: stlxr w8, w9, [x10] -; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB6_1 -; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #4] ; 4-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: add sp, sp, #16 ; =16 +; CHECK-NOLSE-O0-NEXT: orr w12, w8, w9 +; CHECK-NOLSE-O0-NEXT: LBB6_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB6_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr w9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp w9, w8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB6_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB6_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, w12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB6_2 +; CHECK-NOLSE-O0-NEXT: LBB6_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB6_1 +; CHECK-NOLSE-O0-NEXT: ; %bb.5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; =32 ; CHECK-NOLSE-O0-NEXT: ret ; ; CHECK-LSE-O1-LABEL: fetch_and_or: @@ -409,19 +466,36 
@@ ; ; CHECK-NOLSE-O0-LABEL: fetch_and_or_64: ; CHECK-NOLSE-O0: ; %bb.0: -; CHECK-NOLSE-O0-NEXT: sub sp, sp, #16 ; =16 -; CHECK-NOLSE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: sub sp, sp, #32 ; =32 +; CHECK-NOLSE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: ldr x8, [x0] +; CHECK-NOLSE-O0-NEXT: str x8, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: LBB7_1: ; %atomicrmw.start -; CHECK-NOLSE-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NOLSE-O0-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: ldxr x8, [x10] -; CHECK-NOLSE-O0-NEXT: str x8, [sp] ; 8-byte Folded Spill -; CHECK-NOLSE-O0-NEXT: orr x9, x8, #0x7 -; CHECK-NOLSE-O0-NEXT: stxr w8, x9, [x10] -; CHECK-NOLSE-O0-NEXT: cbnz w8, LBB7_1 -; CHECK-NOLSE-O0-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NOLSE-O0-NEXT: ldr x0, [sp] ; 8-byte Folded Reload -; CHECK-NOLSE-O0-NEXT: add sp, sp, #16 ; =16 +; CHECK-NOLSE-O0-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NOLSE-O0-NEXT: ; Child Loop BB7_2 Depth 2 +; CHECK-NOLSE-O0-NEXT: ldr x8, [sp, #24] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: orr x12, x8, #0x7 +; CHECK-NOLSE-O0-NEXT: LBB7_2: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; Parent Loop BB7_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NOLSE-O0-NEXT: ldaxr x9, [x11] +; CHECK-NOLSE-O0-NEXT: cmp x9, x8 +; CHECK-NOLSE-O0-NEXT: b.ne LBB7_4 +; CHECK-NOLSE-O0-NEXT: ; %bb.3: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB7_2 Depth=2 +; CHECK-NOLSE-O0-NEXT: stlxr w10, x12, [x11] +; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB7_2 +; CHECK-NOLSE-O0-NEXT: LBB7_4: ; %atomicrmw.start +; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill +; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 +; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded 
Spill +; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB7_1 +; CHECK-NOLSE-O0-NEXT: ; %bb.5: ; %atomicrmw.end +; CHECK-NOLSE-O0-NEXT: ldr x0, [sp, #8] ; 8-byte Folded Reload +; CHECK-NOLSE-O0-NEXT: add sp, sp, #32 ; =32 ; CHECK-NOLSE-O0-NEXT: ret ; ; CHECK-LSE-O1-LABEL: fetch_and_or_64: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll @@ -0,0 +1,697 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -O0 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=NOLSE +; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -mattr=+lse -O0 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=LSE + +; Ensure there's no stack spill in between ldxr/stxr pairs. + +define i8 @test_rmw_add_8(i8* %dst) { +; NOLSE-LABEL: test_rmw_add_8: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: .cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldrb w8, [x0] +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_2 Depth 2 +; NOLSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: add w12, w9, #1 // =1 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrb w8, [x11] +; NOLSE-NEXT: cmp w8, w9, uxtb +; NOLSE-NEXT: b.ne .LBB0_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=2 +; NOLSE-NEXT: stlxrb w10, w12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB0_2 +; NOLSE-NEXT: .LBB0_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: 
Header=BB0_1 Depth=1 +; NOLSE-NEXT: subs w9, w8, w9, uxtb +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: b .LBB0_5 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_add_8: +; LSE: // %bb.0: // %entry +; LSE-NEXT: mov w8, #1 +; LSE-NEXT: ldaddalb w8, w0, [x0] +; LSE-NEXT: ret +entry: + %res = atomicrmw add i8* %dst, i8 1 seq_cst + ret i8 %res +} + +define i16 @test_rmw_add_16(i16* %dst) { +; NOLSE-LABEL: test_rmw_add_16: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: .cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldrh w8, [x0] +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_2 Depth 2 +; NOLSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: add w12, w9, #1 // =1 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w8, [x11] +; NOLSE-NEXT: cmp w8, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=2 +; NOLSE-NEXT: stlxrh w10, w12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB1_2 +; NOLSE-NEXT: .LBB1_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_1 Depth=1 +; NOLSE-NEXT: subs w9, w8, w9, uxth +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; 
NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: b .LBB1_5 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_add_16: +; LSE: // %bb.0: // %entry +; LSE-NEXT: mov w8, #1 +; LSE-NEXT: ldaddalh w8, w0, [x0] +; LSE-NEXT: ret +entry: + %res = atomicrmw add i16* %dst, i16 1 seq_cst + ret i16 %res +} + +define i32 @test_rmw_add_32(i32* %dst) { +; NOLSE-LABEL: test_rmw_add_32: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: .cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldr w8, [x0] +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_2 Depth 2 +; NOLSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: add w12, w9, #1 // =1 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w8, [x11] +; NOLSE-NEXT: cmp w8, w9 +; NOLSE-NEXT: b.ne .LBB2_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=2 +; NOLSE-NEXT: stlxr w10, w12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB2_2 +; NOLSE-NEXT: .LBB2_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_1 Depth=1 +; NOLSE-NEXT: subs w9, w8, w9 +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: b .LBB2_5 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_add_32: +; LSE: // %bb.0: 
// %entry +; LSE-NEXT: mov w8, #1 +; LSE-NEXT: ldaddal w8, w0, [x0] +; LSE-NEXT: ret +entry: + %res = atomicrmw add i32* %dst, i32 1 seq_cst + ret i32 %res +} + +define i64 @test_rmw_add_64(i64* %dst) { +; NOLSE-LABEL: test_rmw_add_64: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: .cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldr x8, [x0] +; NOLSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_2 Depth 2 +; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: add x12, x9, #1 // =1 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x8, [x11] +; NOLSE-NEXT: cmp x8, x9 +; NOLSE-NEXT: b.ne .LBB3_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=2 +; NOLSE-NEXT: stlxr w10, x12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB3_2 +; NOLSE-NEXT: .LBB3_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_1 Depth=1 +; NOLSE-NEXT: subs x9, x8, x9 +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: b .LBB3_5 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_add_64: +; LSE: // %bb.0: // %entry +; LSE-NEXT: mov w8, #1 +; LSE-NEXT: // kill: def $x8 killed $w8 +; LSE-NEXT: ldaddal x8, x0, [x0] +; LSE-NEXT: ret +entry: + %res = atomicrmw add i64* %dst, i64 1 seq_cst + ret i64 %res +} + +define i128 @test_rmw_add_128(i128* %dst) { +; 
NOLSE-LABEL: test_rmw_add_128: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #48 // =48 +; NOLSE-NEXT: .cfi_def_cfa_offset 48 +; NOLSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill +; NOLSE-NEXT: ldr x8, [x0, #8] +; NOLSE-NEXT: ldr x9, [x0] +; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_2 Depth 2 +; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: adds x14, x8, #1 // =1 +; NOLSE-NEXT: mov x9, xzr +; NOLSE-NEXT: adcs x15, x11, x9 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x10, x9, [x13] +; NOLSE-NEXT: cmp x10, x8 +; NOLSE-NEXT: cset w12, ne +; NOLSE-NEXT: cmp x9, x11 +; NOLSE-NEXT: cinc w12, w12, ne +; NOLSE-NEXT: cbnz w12, .LBB4_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 +; NOLSE-NEXT: stlxp w12, x14, x15, [x13] +; NOLSE-NEXT: cbnz w12, .LBB4_2 +; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 +; NOLSE-NEXT: eor x11, x9, x11 +; NOLSE-NEXT: eor x8, x10, x8 +; NOLSE-NEXT: orr x8, x8, x11 +; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill +; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: cbnz x8, .LBB4_1 +; NOLSE-NEXT: b .LBB4_5 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #48 // =48 +; NOLSE-NEXT: ret +; +; LSE-LABEL: 
test_rmw_add_128: +; LSE: // %bb.0: // %entry +; LSE-NEXT: sub sp, sp, #80 // =80 +; LSE-NEXT: .cfi_def_cfa_offset 80 +; LSE-NEXT: str x0, [sp, #56] // 8-byte Folded Spill +; LSE-NEXT: ldr x8, [x0, #8] +; LSE-NEXT: ldr x9, [x0] +; LSE-NEXT: str x9, [sp, #64] // 8-byte Folded Spill +; LSE-NEXT: str x8, [sp, #72] // 8-byte Folded Spill +; LSE-NEXT: b .LBB4_1 +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: ldr x10, [sp, #72] // 8-byte Folded Reload +; LSE-NEXT: ldr x8, [sp, #64] // 8-byte Folded Reload +; LSE-NEXT: ldr x9, [sp, #56] // 8-byte Folded Reload +; LSE-NEXT: adds x2, x8, #1 // =1 +; LSE-NEXT: mov x11, xzr +; LSE-NEXT: adcs x11, x10, x11 +; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3 +; LSE-NEXT: mov x3, x11 +; LSE-NEXT: mov x0, x8 +; LSE-NEXT: mov x1, x10 +; LSE-NEXT: stp x0, x1, [sp, #8] // 16-byte Folded Spill +; LSE-NEXT: caspal x0, x1, x2, x3, [x9] +; LSE-NEXT: stp x0, x1, [sp, #24] // 16-byte Folded Spill +; LSE-NEXT: mov x9, x1 +; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; LSE-NEXT: eor x11, x9, x10 +; LSE-NEXT: mov x10, x0 +; LSE-NEXT: str x10, [sp, #48] // 8-byte Folded Spill +; LSE-NEXT: eor x8, x10, x8 +; LSE-NEXT: orr x8, x8, x11 +; LSE-NEXT: str x10, [sp, #64] // 8-byte Folded Spill +; LSE-NEXT: str x9, [sp, #72] // 8-byte Folded Spill +; LSE-NEXT: cbnz x8, .LBB4_1 +; LSE-NEXT: b .LBB4_2 +; LSE-NEXT: .LBB4_2: // %atomicrmw.end +; LSE-NEXT: ldr x1, [sp, #40] // 8-byte Folded Reload +; LSE-NEXT: ldr x0, [sp, #48] // 8-byte Folded Reload +; LSE-NEXT: add sp, sp, #80 // =80 +; LSE-NEXT: ret +entry: + %res = atomicrmw add i128* %dst, i128 1 seq_cst + ret i128 %res +} +define i8 @test_rmw_nand_8(i8* %dst) { +; NOLSE-LABEL: test_rmw_nand_8: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: .cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldrb w8, [x0] +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded 
Spill +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_2 Depth 2 +; NOLSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: mvn w8, w9 +; NOLSE-NEXT: orr w12, w8, #0xfffffffe +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrb w8, [x11] +; NOLSE-NEXT: cmp w8, w9, uxtb +; NOLSE-NEXT: b.ne .LBB5_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=2 +; NOLSE-NEXT: stlxrb w10, w12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB5_2 +; NOLSE-NEXT: .LBB5_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_1 Depth=1 +; NOLSE-NEXT: subs w9, w8, w9, uxtb +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: b .LBB5_5 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_nand_8: +; LSE: // %bb.0: // %entry +; LSE-NEXT: sub sp, sp, #32 // =32 +; LSE-NEXT: .cfi_def_cfa_offset 32 +; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; LSE-NEXT: ldrb w8, [x0] +; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; LSE-NEXT: b .LBB5_1 +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; LSE-NEXT: mvn w8, w9 +; LSE-NEXT: orr w10, w8, #0xfffffffe +; LSE-NEXT: mov w8, w9 +; LSE-NEXT: casalb w8, w10, [x11] +; LSE-NEXT: str w8, [sp, #8] // 4-byte Folded Spill +; LSE-NEXT: subs w9, w8, w9, uxtb +; LSE-NEXT: cset w9, eq +; LSE-NEXT: 
str w8, [sp, #12] // 4-byte Folded Spill +; LSE-NEXT: subs w9, w9, #1 // =1 +; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: b .LBB5_2 +; LSE-NEXT: .LBB5_2: // %atomicrmw.end +; LSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; LSE-NEXT: add sp, sp, #32 // =32 +; LSE-NEXT: ret +entry: + %res = atomicrmw nand i8* %dst, i8 1 seq_cst + ret i8 %res +} + +define i16 @test_rmw_nand_16(i16* %dst) { +; NOLSE-LABEL: test_rmw_nand_16: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: .cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldrh w8, [x0] +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_2 Depth 2 +; NOLSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: mvn w8, w9 +; NOLSE-NEXT: orr w12, w8, #0xfffffffe +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w8, [x11] +; NOLSE-NEXT: cmp w8, w9, uxth +; NOLSE-NEXT: b.ne .LBB6_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=2 +; NOLSE-NEXT: stlxrh w10, w12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB6_2 +; NOLSE-NEXT: .LBB6_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_1 Depth=1 +; NOLSE-NEXT: subs w9, w8, w9, uxth +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB6_1 +; NOLSE-NEXT: b .LBB6_5 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; NOLSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: 
test_rmw_nand_16: +; LSE: // %bb.0: // %entry +; LSE-NEXT: sub sp, sp, #32 // =32 +; LSE-NEXT: .cfi_def_cfa_offset 32 +; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; LSE-NEXT: ldrh w8, [x0] +; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; LSE-NEXT: b .LBB6_1 +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; LSE-NEXT: mvn w8, w9 +; LSE-NEXT: orr w10, w8, #0xfffffffe +; LSE-NEXT: mov w8, w9 +; LSE-NEXT: casalh w8, w10, [x11] +; LSE-NEXT: str w8, [sp, #8] // 4-byte Folded Spill +; LSE-NEXT: subs w9, w8, w9, uxth +; LSE-NEXT: cset w9, eq +; LSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; LSE-NEXT: subs w9, w9, #1 // =1 +; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: b .LBB6_2 +; LSE-NEXT: .LBB6_2: // %atomicrmw.end +; LSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; LSE-NEXT: add sp, sp, #32 // =32 +; LSE-NEXT: ret +entry: + %res = atomicrmw nand i16* %dst, i16 1 seq_cst + ret i16 %res +} + +define i32 @test_rmw_nand_32(i32* %dst) { +; NOLSE-LABEL: test_rmw_nand_32: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: .cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldr w8, [x0] +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_2 Depth 2 +; NOLSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: mvn w8, w9 +; NOLSE-NEXT: orr w12, w8, #0xfffffffe +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w8, [x11] +; NOLSE-NEXT: cmp w8, w9 +; 
NOLSE-NEXT: b.ne .LBB7_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=2 +; NOLSE-NEXT: stlxr w10, w12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB7_2 +; NOLSE-NEXT: .LBB7_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_1 Depth=1 +; NOLSE-NEXT: subs w9, w8, w9 +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: b .LBB7_5 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_nand_32: +; LSE: // %bb.0: // %entry +; LSE-NEXT: sub sp, sp, #32 // =32 +; LSE-NEXT: .cfi_def_cfa_offset 32 +; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; LSE-NEXT: ldr w8, [x0] +; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; LSE-NEXT: b .LBB7_1 +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; LSE-NEXT: mvn w8, w9 +; LSE-NEXT: orr w10, w8, #0xfffffffe +; LSE-NEXT: mov w8, w9 +; LSE-NEXT: casal w8, w10, [x11] +; LSE-NEXT: str w8, [sp, #8] // 4-byte Folded Spill +; LSE-NEXT: subs w9, w8, w9 +; LSE-NEXT: cset w9, eq +; LSE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; LSE-NEXT: subs w9, w9, #1 // =1 +; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: b .LBB7_2 +; LSE-NEXT: .LBB7_2: // %atomicrmw.end +; LSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload +; LSE-NEXT: add sp, sp, #32 // =32 +; LSE-NEXT: ret +entry: + %res = atomicrmw nand i32* %dst, i32 1 seq_cst + ret i32 %res +} + +define i64 @test_rmw_nand_64(i64* %dst) { +; NOLSE-LABEL: test_rmw_nand_64: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #32 // =32 +; NOLSE-NEXT: 
.cfi_def_cfa_offset 32 +; NOLSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: ldr x8, [x0] +; NOLSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_2 Depth 2 +; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: mov w8, w9 +; NOLSE-NEXT: mvn w10, w8 +; NOLSE-NEXT: // implicit-def: $x8 +; NOLSE-NEXT: mov w8, w10 +; NOLSE-NEXT: orr x12, x8, #0xfffffffffffffffe +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x8, [x11] +; NOLSE-NEXT: cmp x8, x9 +; NOLSE-NEXT: b.ne .LBB8_4 +; NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=2 +; NOLSE-NEXT: stlxr w10, x12, [x11] +; NOLSE-NEXT: cbnz w10, .LBB8_2 +; NOLSE-NEXT: .LBB8_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_1 Depth=1 +; NOLSE-NEXT: subs x9, x8, x9 +; NOLSE-NEXT: cset w9, eq +; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: subs w9, w9, #1 // =1 +; NOLSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: b .LBB8_5 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; NOLSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #32 // =32 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_nand_64: +; LSE: // %bb.0: // %entry +; LSE-NEXT: sub sp, sp, #32 // =32 +; LSE-NEXT: .cfi_def_cfa_offset 32 +; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill +; LSE-NEXT: ldr x8, [x0] +; LSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill +; LSE-NEXT: b .LBB8_1 +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload +; 
LSE-NEXT: mov w8, w9 +; LSE-NEXT: mvn w10, w8 +; LSE-NEXT: // implicit-def: $x8 +; LSE-NEXT: mov w8, w10 +; LSE-NEXT: orr x10, x8, #0xfffffffffffffffe +; LSE-NEXT: mov x8, x9 +; LSE-NEXT: casal x8, x10, [x11] +; LSE-NEXT: str x8, [sp] // 8-byte Folded Spill +; LSE-NEXT: subs x9, x8, x9 +; LSE-NEXT: cset w9, eq +; LSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; LSE-NEXT: subs w9, w9, #1 // =1 +; LSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: b .LBB8_2 +; LSE-NEXT: .LBB8_2: // %atomicrmw.end +; LSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload +; LSE-NEXT: add sp, sp, #32 // =32 +; LSE-NEXT: ret +entry: + %res = atomicrmw nand i64* %dst, i64 1 seq_cst + ret i64 %res +} + +define i128 @test_rmw_nand_128(i128* %dst) { +; NOLSE-LABEL: test_rmw_nand_128: +; NOLSE: // %bb.0: // %entry +; NOLSE-NEXT: sub sp, sp, #48 // =48 +; NOLSE-NEXT: .cfi_def_cfa_offset 48 +; NOLSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill +; NOLSE-NEXT: ldr x8, [x0, #8] +; NOLSE-NEXT: ldr x9, [x0] +; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_2 Depth 2 +; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: mov w9, w8 +; NOLSE-NEXT: mvn w10, w9 +; NOLSE-NEXT: // implicit-def: $x9 +; NOLSE-NEXT: mov w9, w10 +; NOLSE-NEXT: orr x14, x9, #0xfffffffffffffffe +; NOLSE-NEXT: mov x15, #-1 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x10, x9, [x13] +; NOLSE-NEXT: cmp x10, x8 +; NOLSE-NEXT: cset w12, ne +; NOLSE-NEXT: cmp x9, x11 +; NOLSE-NEXT: cinc w12, w12, ne +; NOLSE-NEXT: cbnz w12, .LBB9_4 +; 
NOLSE-NEXT: // %bb.3: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 +; NOLSE-NEXT: stlxp w12, x14, x15, [x13] +; NOLSE-NEXT: cbnz w12, .LBB9_2 +; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1 +; NOLSE-NEXT: eor x11, x9, x11 +; NOLSE-NEXT: eor x8, x10, x8 +; NOLSE-NEXT: orr x8, x8, x11 +; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill +; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: cbnz x8, .LBB9_1 +; NOLSE-NEXT: b .LBB9_5 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #48 // =48 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_rmw_nand_128: +; LSE: // %bb.0: // %entry +; LSE-NEXT: sub sp, sp, #80 // =80 +; LSE-NEXT: .cfi_def_cfa_offset 80 +; LSE-NEXT: str x0, [sp, #56] // 8-byte Folded Spill +; LSE-NEXT: ldr x8, [x0, #8] +; LSE-NEXT: ldr x9, [x0] +; LSE-NEXT: str x9, [sp, #64] // 8-byte Folded Spill +; LSE-NEXT: str x8, [sp, #72] // 8-byte Folded Spill +; LSE-NEXT: b .LBB9_1 +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: ldr x10, [sp, #72] // 8-byte Folded Reload +; LSE-NEXT: ldr x8, [sp, #64] // 8-byte Folded Reload +; LSE-NEXT: ldr x9, [sp, #56] // 8-byte Folded Reload +; LSE-NEXT: mov x0, x8 +; LSE-NEXT: mov x1, x10 +; LSE-NEXT: stp x0, x1, [sp, #8] // 16-byte Folded Spill +; LSE-NEXT: mov w11, w8 +; LSE-NEXT: mvn w12, w11 +; LSE-NEXT: // implicit-def: $x11 +; LSE-NEXT: mov w11, w12 +; LSE-NEXT: orr x2, x11, #0xfffffffffffffffe +; LSE-NEXT: mov x11, #-1 +; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3 +; LSE-NEXT: mov x3, x11 +; LSE-NEXT: caspal x0, x1, x2, x3, [x9] +; LSE-NEXT: stp x0, x1, [sp, #24] // 16-byte Folded Spill +; LSE-NEXT: mov x9, x1 +; LSE-NEXT: str x9, [sp, 
#40] // 8-byte Folded Spill +; LSE-NEXT: eor x11, x9, x10 +; LSE-NEXT: mov x10, x0 +; LSE-NEXT: str x10, [sp, #48] // 8-byte Folded Spill +; LSE-NEXT: eor x8, x10, x8 +; LSE-NEXT: orr x8, x8, x11 +; LSE-NEXT: str x10, [sp, #64] // 8-byte Folded Spill +; LSE-NEXT: str x9, [sp, #72] // 8-byte Folded Spill +; LSE-NEXT: cbnz x8, .LBB9_1 +; LSE-NEXT: b .LBB9_2 +; LSE-NEXT: .LBB9_2: // %atomicrmw.end +; LSE-NEXT: ldr x1, [sp, #40] // 8-byte Folded Reload +; LSE-NEXT: ldr x0, [sp, #48] // 8-byte Folded Reload +; LSE-NEXT: add sp, sp, #80 // =80 +; LSE-NEXT: ret +entry: + %res = atomicrmw nand i128* %dst, i128 1 seq_cst + ret i128 %res +} diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll --- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64-- -atomic-expand %s | FileCheck %s -; RUN: opt -S -mtriple=aarch64-- -mattr=+outline-atomics -atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS +; RUN: opt -O1 -S -mtriple=aarch64-- -atomic-expand %s | FileCheck %s +; RUN: opt -O1 -S -mtriple=aarch64-- -mattr=+outline-atomics -atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS define void @atomic_swap_f16(half* %ptr, half %val) nounwind { ; CHECK-LABEL: @atomic_swap_f16(