This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Avoid generating LL/SC for atomicrmw before regalloc at -O0.
Abandoned · Public

Authored by efriedma on Apr 23 2021, 5:54 PM.

Download Raw Diff

Details

Reviewers

t.p.northover
arsenm
rogfer01
mgorny

Summary

This is a retread of the issue we had in the past with cmpxchg: at -O0, fast regalloc can insert spills in weird places, including between the load-exclusive and store-exclusive instructions of an ll/sc loop, so the ll/sc sequence never terminates. To avoid this, don't use the early (IR-level) ll/sc expansion at -O0.

I didn't spend the time to implement individual pseudo-instructions for each atomicrmw operation; instead, this patch just expands atomicrmw to the known-good cmpxchg. This is inefficient, but I'm not sure how much we care, since we don't use these sequences with LSE or atomic outlining.

Fixes https://bugs.llvm.org/show_bug.cgi?id=48017

Diff Detail

Repository: rG LLVM Github Monorepo

Unit Tests: Failed

	Time	Test
	40 ms	x64 debian > LLVM.Transforms/AtomicExpand/AArch64::expand-atomicrmw-xchg-fp.ll
	50 ms	x64 windows > LLVM.Transforms/AtomicExpand/AArch64::expand-atomicrmw-xchg-fp.ll

Event Timeline

efriedma created this revision. · Apr 23 2021, 5:54 PM

Herald added subscribers: tmatheson, danielkiss, jfb and 2 others. · View Herald Transcript · Apr 23 2021, 5:54 PM

efriedma requested review of this revision. · Apr 23 2021, 5:54 PM

Herald added a project: Restricted Project. · View Herald Transcript · Apr 23 2021, 5:54 PM

Herald added a subscriber: wdng. · View Herald Transcript

Harbormaster completed remote builds in B100715: Diff 340213. · Apr 23 2021, 7:25 PM

LemonBoy mentioned this in D101163: [AArch64] Prevent spilling between ldxr/stxr pairs. · Apr 24 2021, 12:49 AM

See D101163

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

19 lines

test/

CodeGen/

AArch64/

atomicrmw-O0.ll

243 lines

Diff 340213

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 16,659 Lines • ▼ Show 20 Lines
	// For the real atomic operations, we have ldxr/stxr up to 128 bits,			// For the real atomic operations, we have ldxr/stxr up to 128 bits,
	TargetLowering::AtomicExpansionKind			TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {			AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	if (AI->isFloatingPointOperation())			if (AI->isFloatingPointOperation())
	return AtomicExpansionKind::CmpXChg;			return AtomicExpansionKind::CmpXChg;

	unsigned Size = AI->getType()->getPrimitiveSizeInBits();			unsigned Size = AI->getType()->getPrimitiveSizeInBits();
	if (Size > 128) return AtomicExpansionKind::None;			if (Size > 128) return AtomicExpansionKind::None;
	// Nand not supported in LSE.			// If we have LSE, don't expand non-NAND operations smaller than 128 bits.
	if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;			// We want to use the native instructions.
	// Leave 128 bits to LLSC.			if (Subtarget->hasLSE() && AI->getOperation() != AtomicRMWInst::Nand &&
	if (Subtarget->hasLSE() && Size < 128)			Size < 128)
	return AtomicExpansionKind::None;			return AtomicExpansionKind::None;
	if (Subtarget->outlineAtomics() && Size < 128) {			if (Subtarget->outlineAtomics() && Size < 128) {
	// [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.			// [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
	// Don't outline them unless			// Don't outline them unless
	// (1) high level <atomic> support approved:			// (1) high level <atomic> support approved:
	// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf			// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
	// (2) low level libgcc and compiler-rt support implemented by:			// (2) low level libgcc and compiler-rt support implemented by:
	// min/max outline atomics helpers			// min/max outline atomics helpers
	if (AI->getOperation() != AtomicRMWInst::Min &&			if (AI->getOperation() != AtomicRMWInst::Min &&
	AI->getOperation() != AtomicRMWInst::Max &&			AI->getOperation() != AtomicRMWInst::Max &&
	AI->getOperation() != AtomicRMWInst::UMin &&			AI->getOperation() != AtomicRMWInst::UMin &&
	AI->getOperation() != AtomicRMWInst::UMax) {			AI->getOperation() != AtomicRMWInst::UMax &&
				AI->getOperation() != AtomicRMWInst::Nand) {
	return AtomicExpansionKind::None;			return AtomicExpansionKind::None;
	}			}
	}			}
				// At -O0, use late-expanded pseudo-instruction. See comment in
				// shouldExpandAtomicCmpXchgInIR.
				//
				// FIXME: Expanding to cmpxchg generates a really long nested loop;
				// we could save a bunch of instructions by adding dedicated
				// pseudo-instructions for each atomicrmw operation.
				if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
				return AtomicExpansionKind::CmpXChg;
	return AtomicExpansionKind::LLSC;			return AtomicExpansionKind::LLSC;
	}			}

	TargetLowering::AtomicExpansionKind			TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(			AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
	AtomicCmpXchgInst *AI) const {			AtomicCmpXchgInst *AI) const {
	// If subtarget has LSE, leave cmpxchg intact for codegen.			// If subtarget has LSE, leave cmpxchg intact for codegen.
	if (Subtarget->hasLSE() \|\| Subtarget->outlineAtomics())			if (Subtarget->hasLSE() \|\| Subtarget->outlineAtomics())
	▲ Show 20 Lines • Show All 937 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/atomicrmw-O0.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=arm64-eabi -verify-machineinstrs -O0 \| FileCheck -enable-var-scope %s
				; RUN: llc < %s -mtriple=arm64-eabi -verify-machineinstrs -O0 -mattr=+lse \| FileCheck -enable-var-scope -check-prefix=LSE %s

				; We need to ensure there aren't any load/store instructions between
				; ldaxr and the stlxr. This has been an issue with fast regalloc.

				define i32 @fetch_and_nand(i32* %p) {
				; CHECK-LABEL: fetch_and_nand:
				; CHECK: // %bb.0:
				; CHECK-NEXT: sub sp, sp, #32 // =32
				; CHECK-NEXT: .cfi_def_cfa_offset 32
				; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
				; CHECK-NEXT: ldr w8, [x0]
				; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
				; CHECK-NEXT: b .LBB0_1
				; CHECK-NEXT: .LBB0_1: // %atomicrmw.start
				; CHECK-NEXT: // =>This Loop Header: Depth=1
				; CHECK-NEXT: // Child Loop BB0_2 Depth 2
				; CHECK-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload
				; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
				; CHECK-NEXT: mvn w8, w9
				; CHECK-NEXT: orr w12, w8, #0xfffffff8
				; CHECK-NEXT: .LBB0_2: // %atomicrmw.start
				; CHECK-NEXT: // Parent Loop BB0_1 Depth=1
				; CHECK-NEXT: // => This Inner Loop Header: Depth=2
				; CHECK-NEXT: ldaxr w8, [x11]
				; CHECK-NEXT: cmp w8, w9
				; CHECK-NEXT: b.ne .LBB0_4
				; CHECK-NEXT: // %bb.3: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=2
				; CHECK-NEXT: stlxr w10, w12, [x11]
				; CHECK-NEXT: cbnz w10, .LBB0_2
				; CHECK-NEXT: .LBB0_4: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
				; CHECK-NEXT: subs w9, w8, w9
				; CHECK-NEXT: cset w9, eq
				; CHECK-NEXT: str w8, [sp, #12] // 4-byte Folded Spill
				; CHECK-NEXT: subs w9, w9, #1 // =1
				; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
				; CHECK-NEXT: b.ne .LBB0_1
				; CHECK-NEXT: b .LBB0_5
				; CHECK-NEXT: .LBB0_5: // %atomicrmw.end
				; CHECK-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
				; CHECK-NEXT: add sp, sp, #32 // =32
				; CHECK-NEXT: ret
				;
				; LSE-LABEL: fetch_and_nand:
				; LSE: // %bb.0:
				; LSE-NEXT: sub sp, sp, #32 // =32
				; LSE-NEXT: .cfi_def_cfa_offset 32
				; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
				; LSE-NEXT: ldr w8, [x0]
				; LSE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
				; LSE-NEXT: .LBB0_1: // %atomicrmw.start
				; LSE-NEXT: // =>This Inner Loop Header: Depth=1
				; LSE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload
				; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
				; LSE-NEXT: and w9, w8, #0x7
				; LSE-NEXT: mvn w10, w9
				; LSE-NEXT: mov w9, w8
				; LSE-NEXT: casl w9, w10, [x11]
				; LSE-NEXT: str w9, [sp, #12] // 4-byte Folded Spill
				; LSE-NEXT: subs w8, w9, w8
				; LSE-NEXT: cset w8, eq
				; LSE-NEXT: str w9, [sp, #28] // 4-byte Folded Spill
				; LSE-NEXT: tbz w8, #0, .LBB0_1
				; LSE-NEXT: // %bb.2: // %atomicrmw.end
				; LSE-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
				; LSE-NEXT: add sp, sp, #32 // =32
				; LSE-NEXT: ret
				%val = atomicrmw nand i32* %p, i32 7 release
				ret i32 %val
				}

				define i64 @fetch_and_nand_64(i64* %p) {
				; CHECK-LABEL: fetch_and_nand_64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: sub sp, sp, #32 // =32
				; CHECK-NEXT: .cfi_def_cfa_offset 32
				; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
				; CHECK-NEXT: ldr x8, [x0]
				; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
				; CHECK-NEXT: b .LBB1_1
				; CHECK-NEXT: .LBB1_1: // %atomicrmw.start
				; CHECK-NEXT: // =>This Loop Header: Depth=1
				; CHECK-NEXT: // Child Loop BB1_2 Depth 2
				; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
				; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
				; CHECK-NEXT: mov w8, w9
				; CHECK-NEXT: mvn w10, w8
				; CHECK-NEXT: // implicit-def: $x8
				; CHECK-NEXT: mov w8, w10
				; CHECK-NEXT: orr x12, x8, #0xfffffffffffffff8
				; CHECK-NEXT: .LBB1_2: // %atomicrmw.start
				; CHECK-NEXT: // Parent Loop BB1_1 Depth=1
				; CHECK-NEXT: // => This Inner Loop Header: Depth=2
				; CHECK-NEXT: ldaxr x8, [x11]
				; CHECK-NEXT: cmp x8, x9
				; CHECK-NEXT: b.ne .LBB1_4
				; CHECK-NEXT: // %bb.3: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=2
				; CHECK-NEXT: stlxr w10, x12, [x11]
				; CHECK-NEXT: cbnz w10, .LBB1_2
				; CHECK-NEXT: .LBB1_4: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1
				; CHECK-NEXT: subs x9, x8, x9
				; CHECK-NEXT: cset w9, eq
				; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
				; CHECK-NEXT: subs w9, w9, #1 // =1
				; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
				; CHECK-NEXT: b.ne .LBB1_1
				; CHECK-NEXT: b .LBB1_5
				; CHECK-NEXT: .LBB1_5: // %atomicrmw.end
				; CHECK-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
				; CHECK-NEXT: add sp, sp, #32 // =32
				; CHECK-NEXT: ret
				;
				; LSE-LABEL: fetch_and_nand_64:
				; LSE: // %bb.0:
				; LSE-NEXT: sub sp, sp, #32 // =32
				; LSE-NEXT: .cfi_def_cfa_offset 32
				; LSE-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
				; LSE-NEXT: ldr x8, [x0]
				; LSE-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
				; LSE-NEXT: .LBB1_1: // %atomicrmw.start
				; LSE-NEXT: // =>This Inner Loop Header: Depth=1
				; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload
				; LSE-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
				; LSE-NEXT: and x9, x8, #0x7
				; LSE-NEXT: mvn x10, x9
				; LSE-NEXT: mov x9, x8
				; LSE-NEXT: casal x9, x10, [x11]
				; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
				; LSE-NEXT: subs x8, x9, x8
				; LSE-NEXT: cset w8, eq
				; LSE-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
				; LSE-NEXT: tbz w8, #0, .LBB1_1
				; LSE-NEXT: // %bb.2: // %atomicrmw.end
				; LSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
				; LSE-NEXT: add sp, sp, #32 // =32
				; LSE-NEXT: ret
				%val = atomicrmw nand i64* %p, i64 7 acq_rel
				ret i64 %val
				}

				define i32 @fetch_and_or(i32* %p) {
				; CHECK-LABEL: fetch_and_or:
				; CHECK: // %bb.0:
				; CHECK-NEXT: sub sp, sp, #32 // =32
				; CHECK-NEXT: .cfi_def_cfa_offset 32
				; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
				; CHECK-NEXT: ldr w8, [x0]
				; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
				; CHECK-NEXT: b .LBB2_1
				; CHECK-NEXT: .LBB2_1: // %atomicrmw.start
				; CHECK-NEXT: // =>This Loop Header: Depth=1
				; CHECK-NEXT: // Child Loop BB2_2 Depth 2
				; CHECK-NEXT: ldr w9, [sp, #28] // 4-byte Folded Reload
				; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
				; CHECK-NEXT: mov w8, #5
				; CHECK-NEXT: orr w12, w9, w8
				; CHECK-NEXT: .LBB2_2: // %atomicrmw.start
				; CHECK-NEXT: // Parent Loop BB2_1 Depth=1
				; CHECK-NEXT: // => This Inner Loop Header: Depth=2
				; CHECK-NEXT: ldaxr w8, [x11]
				; CHECK-NEXT: cmp w8, w9
				; CHECK-NEXT: b.ne .LBB2_4
				; CHECK-NEXT: // %bb.3: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB2_2 Depth=2
				; CHECK-NEXT: stlxr w10, w12, [x11]
				; CHECK-NEXT: cbnz w10, .LBB2_2
				; CHECK-NEXT: .LBB2_4: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
				; CHECK-NEXT: subs w9, w8, w9
				; CHECK-NEXT: cset w9, eq
				; CHECK-NEXT: str w8, [sp, #12] // 4-byte Folded Spill
				; CHECK-NEXT: subs w9, w9, #1 // =1
				; CHECK-NEXT: str w8, [sp, #28] // 4-byte Folded Spill
				; CHECK-NEXT: b.ne .LBB2_1
				; CHECK-NEXT: b .LBB2_5
				; CHECK-NEXT: .LBB2_5: // %atomicrmw.end
				; CHECK-NEXT: ldr w0, [sp, #12] // 4-byte Folded Reload
				; CHECK-NEXT: add sp, sp, #32 // =32
				; CHECK-NEXT: ret
				;
				; LSE-LABEL: fetch_and_or:
				; LSE: // %bb.0:
				; LSE-NEXT: mov w8, #5
				; LSE-NEXT: ldsetal w8, w0, [x0]
				; LSE-NEXT: ret
				%val = atomicrmw or i32* %p, i32 5 seq_cst
				ret i32 %val
				}

				define i64 @fetch_and_or_64(i64* %p) {
				; CHECK-LABEL: fetch_and_or_64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: sub sp, sp, #32 // =32
				; CHECK-NEXT: .cfi_def_cfa_offset 32
				; CHECK-NEXT: str x0, [sp, #16] // 8-byte Folded Spill
				; CHECK-NEXT: ldr x8, [x0]
				; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
				; CHECK-NEXT: b .LBB3_1
				; CHECK-NEXT: .LBB3_1: // %atomicrmw.start
				; CHECK-NEXT: // =>This Loop Header: Depth=1
				; CHECK-NEXT: // Child Loop BB3_2 Depth 2
				; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
				; CHECK-NEXT: ldr x11, [sp, #16] // 8-byte Folded Reload
				; CHECK-NEXT: orr x12, x9, #0x7
				; CHECK-NEXT: .LBB3_2: // %atomicrmw.start
				; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
				; CHECK-NEXT: // => This Inner Loop Header: Depth=2
				; CHECK-NEXT: ldaxr x8, [x11]
				; CHECK-NEXT: cmp x8, x9
				; CHECK-NEXT: b.ne .LBB3_4
				; CHECK-NEXT: // %bb.3: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB3_2 Depth=2
				; CHECK-NEXT: stlxr w10, x12, [x11]
				; CHECK-NEXT: cbnz w10, .LBB3_2
				; CHECK-NEXT: .LBB3_4: // %atomicrmw.start
				; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
				; CHECK-NEXT: subs x9, x8, x9
				; CHECK-NEXT: cset w9, eq
				; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
				; CHECK-NEXT: subs w9, w9, #1 // =1
				; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill
				; CHECK-NEXT: b.ne .LBB3_1
				; CHECK-NEXT: b .LBB3_5
				; CHECK-NEXT: .LBB3_5: // %atomicrmw.end
				; CHECK-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
				; CHECK-NEXT: add sp, sp, #32 // =32
				; CHECK-NEXT: ret
				;
				; LSE-LABEL: fetch_and_or_64:
				; LSE: // %bb.0:
				; LSE-NEXT: mov w8, #7
				; LSE-NEXT: // kill: def $x8 killed $w8
				; LSE-NEXT: ldset x8, x0, [x0]
				; LSE-NEXT: ret
				%val = atomicrmw or i64* %p, i64 7 monotonic
				ret i64 %val
				}