This is an archive of the discontinued LLVM Phabricator instance.

[X86] Use movq for i64 atomic load on 32-bit targets when sse2 is enable
ClosedPublic

Authored by craig.topper on Mar 21 2019, 5:22 PM.

Download Raw Diff

Details

Reviewers

reames
jfb
RKSimon
efriedma

Commits

rG1ffd8e8114bc: [X86] Use movq for i64 atomic load on 32-bit targets when sse2 is enable
rL356807: [X86] Use movq for i64 atomic load on 32-bit targets when sse2 is enable

Summary

Previously, we used a lock cmpxchg8b to do i64 atomic loads on 32-bit targets. If SSE2 is available, we can do better and use a plain movq to do the load instead.

I tried to just use an f64 atomic load and add isel patterns to MOVSD (which the domain fixing pass can turn into MOVQ), but the atomic_load SDNode in TargetSelectionDAG.td requires the type to be an integer.

So I've emitted VZEXT_LOAD instead, which should be selected by isel to a MOVQ. Hopefully we don't need a specific atomic flavor of this. I kept the memory operand from the original AtomicSDNode. I wasn't sure whether I need to set the MOVolatile flag.

I've left some FIXMEs for improvements we can do without SSE2.

Diff Detail

Event Timeline

craig.topper created this revision. Mar 21 2019, 5:22 PM

Herald added a project: Restricted Project. · View Herald Transcript · Mar 21 2019, 5:22 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

From what I understand, the atomic marking in the memory operand should be enough (assuming there aren't any relevant target-specific DAGCombines).

Maybe also add CHECK lines to test/CodeGen/X86/atomic-non-integer.ll?

llvm/test/CodeGen/X86/atomic-load-store-wide.ll
41 ↗	(On Diff #191806)	Maybe we should keep test coverage for old CPUs? Or is that unlikely to be useful?

Update atomic-non-integer.ll too. Add an old CPU command line to atomic-load-store-wide.ll

Do we need to check for NoImplicitFloat somewhere?

Block the transform if noimplicitfloat is set on the function.

LGTM

This revision is now accepted and ready to land. Mar 22 2019, 1:10 PM

Closed by commit rL356807: [X86] Use movq for i64 atomic load on 32-bit targets when sse2 is enable (authored by ctopper). · Explain Why · Mar 22 2019, 1:46 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

42 lines

test/

CodeGen/

X86/

atomic-load-store-wide.ll

49 lines

atomic-non-integer.ll

50 lines

Diff 191822

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 479 Lines • ▼ Show 20 Lines	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);		setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);		setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);		setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);		setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);		setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);		setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}		}

		if (!Subtarget.is64Bit())
		setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

if (Subtarget.hasCmpxchg16b()) {		if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);		setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}		}

// FIXME - use subtarget debug flags		// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&		if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&		!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {		TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
▲ Show 20 Lines • Show All 24,993 Lines • ▼ Show 20 Lines	bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
return false;		return false;
}		}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {		bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return needsCmpXchgNb(SI->getValueOperand()->getType());		return needsCmpXchgNb(SI->getValueOperand()->getType());
}		}

// Note: this turns large loads into lock cmpxchg8b/16b.		// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.		// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
		// TODO: In 32-bit mode, use FILD/FISTP when X87 is available?
TargetLowering::AtomicExpansionKind		TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {		X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return needsCmpXchgNb(LI->getType()) ? AtomicExpansionKind::CmpXChg		Type *MemType = LI->getType();

		// If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
		// can use movq to do the load.
		if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
		isTypeLegal(MVT::v2i64))
		return AtomicExpansionKind::None;

		return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;		: AtomicExpansionKind::None;
}		}

TargetLowering::AtomicExpansionKind		TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {		X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;		unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();		Type *MemType = AI->getType();

// If the operand is too big, we must see if cmpxchg8/16b is available		// If the operand is too big, we must see if cmpxchg8/16b is available
▲ Show 20 Lines • Show All 1,797 Lines • ▼ Show 20 Lines	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);		SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));		Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));		Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(Success);		Results.push_back(Success);
Results.push_back(EFLAGS.getValue(1));		Results.push_back(EFLAGS.getValue(1));
return;		return;
}		}
		case ISD::ATOMIC_LOAD: {
		if (isTypeLegal(MVT::v2i64)) {
		auto *Node = cast<AtomicSDNode>(N);
		// Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the lower
		// 64-bits.
		SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
		SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
		SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
		MVT::i64, Node->getMemOperand());
		SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
		DAG.getIntPtrConstant(0, dl));
		Results.push_back(Res);
		Results.push_back(Ld.getValue(1));
		return;
		}
		// TODO: Use MOVLPS when SSE1 is available?
		// TODO: Use FILD/FISTP when X87 is available?
		// Delegate to generic TypeLegalization. Situations we can really handle
		// should have already been dealt with by AtomicExpandPass.cpp.
		break;
		}
case ISD::ATOMIC_SWAP:		case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:		case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:		case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:		case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:		case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:		case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:		case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:		case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:		case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:		case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:		case ISD::ATOMIC_LOAD_UMAX:
case ISD::ATOMIC_LOAD: {
// Delegate to generic TypeLegalization. Situations we can really handle		// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.		// should have already been dealt with by AtomicExpandPass.cpp.
break;		break;
}
case ISD::BITCAST: {		case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");		assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);		EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();		EVT SrcVT = N->getOperand(0).getValueType();

// If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target		// If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
// we can split using the k-register rather than memory.		// we can split using the k-register rather than memory.
if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {		if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
▲ Show 20 Lines • Show All 16,509 Lines • Show Last 20 Lines

test/CodeGen/X86/atomic-load-store-wide.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mcpu=corei7 -mtriple=i686-- -verify-machineinstrs \| FileCheck %s			; RUN: llc < %s -mcpu=corei7 -mtriple=i686-- -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=SSE42
				; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=NOSSE

	; 64-bit load/store on x86-32			; 64-bit load/store on x86-32
	; FIXME: The generated code can be substantially improved.			; FIXME: The generated code can be substantially improved.

	define void @test1(i64* %ptr, i64 %val1) {			define void @test1(i64* %ptr, i64 %val1) {
	; CHECK-LABEL: test1:			; CHECK-LABEL: test1:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: pushl %ebx			; CHECK-NEXT: pushl %ebx
	Show All 18 Lines
	; CHECK-NEXT: popl %ebx			; CHECK-NEXT: popl %ebx
	; CHECK-NEXT: .cfi_def_cfa_offset 4			; CHECK-NEXT: .cfi_def_cfa_offset 4
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	store atomic i64 %val1, i64* %ptr seq_cst, align 8			store atomic i64 %val1, i64* %ptr seq_cst, align 8
	ret void			ret void
	}			}

	define i64 @test2(i64* %ptr) {			define i64 @test2(i64* %ptr) {
	; CHECK-LABEL: test2:			; SSE42-LABEL: test2:
	; CHECK: # %bb.0:			; SSE42: # %bb.0:
	; CHECK-NEXT: pushl %ebx			; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
	; CHECK-NEXT: .cfi_def_cfa_offset 8			; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
	; CHECK-NEXT: pushl %esi			; SSE42-NEXT: movd %xmm0, %eax
	; CHECK-NEXT: .cfi_def_cfa_offset 12			; SSE42-NEXT: pextrd $1, %xmm0, %edx
	; CHECK-NEXT: .cfi_offset %esi, -12			; SSE42-NEXT: retl
	; CHECK-NEXT: .cfi_offset %ebx, -8			;
	; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi			; NOSSE-LABEL: test2:
	; CHECK-NEXT: xorl %eax, %eax			; NOSSE: # %bb.0:
	; CHECK-NEXT: xorl %edx, %edx			; NOSSE-NEXT: pushl %ebx
	; CHECK-NEXT: xorl %ecx, %ecx			; NOSSE-NEXT: .cfi_def_cfa_offset 8
	; CHECK-NEXT: xorl %ebx, %ebx			; NOSSE-NEXT: pushl %esi
	; CHECK-NEXT: lock cmpxchg8b (%esi)			; NOSSE-NEXT: .cfi_def_cfa_offset 12
	; CHECK-NEXT: popl %esi			; NOSSE-NEXT: .cfi_offset %esi, -12
	; CHECK-NEXT: .cfi_def_cfa_offset 8			; NOSSE-NEXT: .cfi_offset %ebx, -8
	; CHECK-NEXT: popl %ebx			; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
	; CHECK-NEXT: .cfi_def_cfa_offset 4			; NOSSE-NEXT: xorl %eax, %eax
	; CHECK-NEXT: retl			; NOSSE-NEXT: xorl %edx, %edx
				; NOSSE-NEXT: xorl %ecx, %ecx
				; NOSSE-NEXT: xorl %ebx, %ebx
				; NOSSE-NEXT: lock cmpxchg8b (%esi)
				; NOSSE-NEXT: popl %esi
				; NOSSE-NEXT: .cfi_def_cfa_offset 8
				; NOSSE-NEXT: popl %ebx
				; NOSSE-NEXT: .cfi_def_cfa_offset 4
				; NOSSE-NEXT: retl
	%val = load atomic i64, i64* %ptr seq_cst, align 8			%val = load atomic i64, i64* %ptr seq_cst, align 8
	ret i64 %val			ret i64 %val
	}			}

test/CodeGen/X86/atomic-non-integer.ll

	Show First 20 Lines • Show All 206 Lines • ▼ Show 20 Lines
	; X64-NEXT: retq			; X64-NEXT: retq
	%v = load atomic float, float* %fptr unordered, align 4			%v = load atomic float, float* %fptr unordered, align 4
	ret float %v			ret float %v
	}			}

	define double @load_double(double* %fptr) {			define double @load_double(double* %fptr) {
	; X86-SSE-LABEL: load_double:			; X86-SSE-LABEL: load_double:
	; X86-SSE: # %bb.0:			; X86-SSE: # %bb.0:
	; X86-SSE-NEXT: pushl %ebx
	; X86-SSE-NEXT: .cfi_def_cfa_offset 8
	; X86-SSE-NEXT: pushl %esi
	; X86-SSE-NEXT: .cfi_def_cfa_offset 12
	; X86-SSE-NEXT: subl $12, %esp			; X86-SSE-NEXT: subl $12, %esp
	; X86-SSE-NEXT: .cfi_def_cfa_offset 24			; X86-SSE-NEXT: .cfi_def_cfa_offset 16
	; X86-SSE-NEXT: .cfi_offset %esi, -12			; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE-NEXT: .cfi_offset %ebx, -8			; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
	; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi			; X86-SSE-NEXT: movlps %xmm0, (%esp)
	; X86-SSE-NEXT: xorl %eax, %eax
	; X86-SSE-NEXT: xorl %edx, %edx
	; X86-SSE-NEXT: xorl %ecx, %ecx
	; X86-SSE-NEXT: xorl %ebx, %ebx
	; X86-SSE-NEXT: lock cmpxchg8b (%esi)
	; X86-SSE-NEXT: movd %edx, %xmm0
	; X86-SSE-NEXT: movd %eax, %xmm1
	; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	; X86-SSE-NEXT: movq %xmm1, (%esp)
	; X86-SSE-NEXT: fldl (%esp)			; X86-SSE-NEXT: fldl (%esp)
	; X86-SSE-NEXT: addl $12, %esp			; X86-SSE-NEXT: addl $12, %esp
	; X86-SSE-NEXT: .cfi_def_cfa_offset 12
	; X86-SSE-NEXT: popl %esi
	; X86-SSE-NEXT: .cfi_def_cfa_offset 8
	; X86-SSE-NEXT: popl %ebx
	; X86-SSE-NEXT: .cfi_def_cfa_offset 4			; X86-SSE-NEXT: .cfi_def_cfa_offset 4
	; X86-SSE-NEXT: retl			; X86-SSE-NEXT: retl
	;			;
	; X86-NOSSE-LABEL: load_double:			; X86-NOSSE-LABEL: load_double:
	; X86-NOSSE: # %bb.0:			; X86-NOSSE: # %bb.0:
	; X86-NOSSE-NEXT: pushl %ebx			; X86-NOSSE-NEXT: pushl %ebx
	; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8			; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
	; X86-NOSSE-NEXT: pushl %esi			; X86-NOSSE-NEXT: pushl %esi
	▲ Show 20 Lines • Show All 188 Lines • ▼ Show 20 Lines
	; X64-NEXT: retq			; X64-NEXT: retq
	%v = load atomic float, float* %fptr seq_cst, align 4			%v = load atomic float, float* %fptr seq_cst, align 4
	ret float %v			ret float %v
	}			}

	define double @load_double_seq_cst(double* %fptr) {			define double @load_double_seq_cst(double* %fptr) {
	; X86-SSE-LABEL: load_double_seq_cst:			; X86-SSE-LABEL: load_double_seq_cst:
	; X86-SSE: # %bb.0:			; X86-SSE: # %bb.0:
	; X86-SSE-NEXT: pushl %ebx
	; X86-SSE-NEXT: .cfi_def_cfa_offset 8
	; X86-SSE-NEXT: pushl %esi
	; X86-SSE-NEXT: .cfi_def_cfa_offset 12
	; X86-SSE-NEXT: subl $12, %esp			; X86-SSE-NEXT: subl $12, %esp
	; X86-SSE-NEXT: .cfi_def_cfa_offset 24			; X86-SSE-NEXT: .cfi_def_cfa_offset 16
	; X86-SSE-NEXT: .cfi_offset %esi, -12			; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE-NEXT: .cfi_offset %ebx, -8			; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
	; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi			; X86-SSE-NEXT: movlps %xmm0, (%esp)
	; X86-SSE-NEXT: xorl %eax, %eax
	; X86-SSE-NEXT: xorl %edx, %edx
	; X86-SSE-NEXT: xorl %ecx, %ecx
	; X86-SSE-NEXT: xorl %ebx, %ebx
	; X86-SSE-NEXT: lock cmpxchg8b (%esi)
	; X86-SSE-NEXT: movd %edx, %xmm0
	; X86-SSE-NEXT: movd %eax, %xmm1
	; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	; X86-SSE-NEXT: movq %xmm1, (%esp)
	; X86-SSE-NEXT: fldl (%esp)			; X86-SSE-NEXT: fldl (%esp)
	; X86-SSE-NEXT: addl $12, %esp			; X86-SSE-NEXT: addl $12, %esp
	; X86-SSE-NEXT: .cfi_def_cfa_offset 12
	; X86-SSE-NEXT: popl %esi
	; X86-SSE-NEXT: .cfi_def_cfa_offset 8
	; X86-SSE-NEXT: popl %ebx
	; X86-SSE-NEXT: .cfi_def_cfa_offset 4			; X86-SSE-NEXT: .cfi_def_cfa_offset 4
	; X86-SSE-NEXT: retl			; X86-SSE-NEXT: retl
	;			;
	; X86-NOSSE-LABEL: load_double_seq_cst:			; X86-NOSSE-LABEL: load_double_seq_cst:
	; X86-NOSSE: # %bb.0:			; X86-NOSSE: # %bb.0:
	; X86-NOSSE-NEXT: pushl %ebx			; X86-NOSSE-NEXT: pushl %ebx
	; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8			; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
	; X86-NOSSE-NEXT: pushl %esi			; X86-NOSSE-NEXT: pushl %esi
	Show All 30 Lines