This is an archive of the discontinued LLVM Phabricator instance.

[X86] X86CallFrameOptimization - generalize slow push code path
ClosedPublic

Authored by RKSimon on Mar 16 2020, 10:40 AM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel

Commits

rGa7115d51be09: [X86] X86CallFrameOptimization - generalize slow push code path

Summary

Replace the explicit isAtom() || isSLM() test with the slowTwoMemOps() check, which is both more general (it is not tied to a hard-coded list of CPUs) and more specific (it names the actual slow feature), to avoid using the PUSHrmm push-from-memory case.

This is actually very tricky to test in anything but quite complex code, but the atomic-idempotent.ll tests seem to be the most straightforward to use. If people are happy with me using these tests I can add the extra target checks as a pre-commit.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

RKSimon created this revision. · Mar 16 2020, 10:40 AM

Herald added a project: Restricted Project. · View Herald Transcript · Mar 16 2020, 10:40 AM

Herald added subscribers: jfb, hiraditya. · View Herald Transcript

Harbormaster failed remote builds in B49335: Diff 250596! · Mar 16 2020, 11:27 AM

craig.topper added inline comments. · Mar 17 2020, 4:14 PM

llvm/test/CodeGen/X86/atomic-idempotent.ll
4	Why did SLM/GLM/KNL lose sse2, but atom got to keep it?

disabled sse2 on atom to match the other targets

RKSimon marked an inline comment as done. · Mar 26 2020, 12:38 PM

RKSimon added inline comments.

llvm/test/CodeGen/X86/atomic-idempotent.ll
4	Disabling sse2 was more of a technicality to force the SLM-like targets to match, but I've set atom to follow this as well to stop the fence insertion

Harbormaster completed remote builds in B50600: Diff 252949. · Mar 26 2020, 1:39 PM

LGTM

This revision is now accepted and ready to land. · Mar 28 2020, 1:43 PM

Closed by commit rGa7115d51be09: [X86] X86CallFrameOptimization - generalize slow push code path (authored by RKSimon). · Explain Why · Mar 29 2020, 3:10 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86CallFrameOptimization.cpp

2 lines

test/

CodeGen/

X86/

atomic-idempotent.ll

547 lines

Diff 252949

llvm/lib/Target/X86/X86CallFrameOptimization.cpp

Show First 20 Lines • Show All 543 Lines • ▼ Show 20 Lines	case X86::MOV64mr: {
BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)		BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
.addReg(UndefReg)		.addReg(UndefReg)
.add(PushOp)		.add(PushOp)
.addImm(X86::sub_32bit);		.addImm(X86::sub_32bit);
}		}

// If PUSHrmm is not slow on this target, try to fold the source of the		// If PUSHrmm is not slow on this target, try to fold the source of the
// push into the instruction.		// push into the instruction.
bool SlowPUSHrmm = STI->isAtom() \|\| STI->isSLM();		bool SlowPUSHrmm = STI->slowTwoMemOps();

// Check that this is legal to fold. Right now, we're extremely		// Check that this is legal to fold. Right now, we're extremely
// conservative about that.		// conservative about that.
MachineInstr *DefMov = nullptr;		MachineInstr *DefMov = nullptr;
if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {		if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;		PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));		Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));

▲ Show 20 Lines • Show All 77 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/atomic-idempotent.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs \| FileCheck %s --check-prefixes=CHECK,X64
	; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=X86			; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mattr=+sse2 \| FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SSE2
				; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=slm -mattr=-sse2 \| FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SLM
				craig.topperUnsubmitted Not Done Reply Inline Actions Why did SLM/GLM/KNL lose sse2, but atom got to keep it? craig.topper: Why did SLM/GLM/KNL lose sse2, but atom got to keep it?
				RKSimonAuthorUnsubmitted Done Reply Inline Actions Disabling sse2 was more of a technicality to force the SLM-like targets to match, but I've set atom to follow this as well to stop the fence insertion RKSimon: Disabling sse2 was more of a technicality to force the SLM-like targets to match, but I've set…
				; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=goldmont -mattr=-sse2 \| FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SLM
				; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=knl -mattr=-sse2 \| FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SLM
				; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=atom -mattr=-sse2 \| FileCheck %s --check-prefixes=CHECK,X86,X86-ATOM

	; On x86, an atomic rmw operation that does not modify the value in memory			; On x86, an atomic rmw operation that does not modify the value in memory
	; (such as atomic add 0) can be replaced by an mfence followed by a mov.			; (such as atomic add 0) can be replaced by an mfence followed by a mov.
	; This is explained (with the motivation for such an optimization) in			; This is explained (with the motivation for such an optimization) in
	; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf			; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf

	define i8 @add8(i8* %p) {			define i8 @add8(i8* %p) {
	; X64-LABEL: add8:			; X64-LABEL: add8:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: mfence			; X64-NEXT: mfence
	; X64-NEXT: movb (%rdi), %al			; X64-NEXT: movb (%rdi), %al
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: add8:			; X86-SSE2-LABEL: add8:
	; X86: # %bb.0:			; X86-SSE2: # %bb.0:
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: mfence			; X86-SSE2-NEXT: mfence
	; X86-NEXT: movb (%eax), %al			; X86-SSE2-NEXT: movb (%eax), %al
	; X86-NEXT: retl			; X86-SSE2-NEXT: retl
				;
				; X86-SLM-LABEL: add8:
				; X86-SLM: # %bb.0:
				; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-SLM-NEXT: xorl %eax, %eax
				; X86-SLM-NEXT: lock xaddb %al, (%ecx)
				; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
				; X86-SLM-NEXT: retl
				;
				; X86-ATOM-LABEL: add8:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-ATOM-NEXT: xorl %eax, %eax
				; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
				; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	%1 = atomicrmw add i8* %p, i8 0 monotonic			%1 = atomicrmw add i8* %p, i8 0 monotonic
	ret i8 %1			ret i8 %1
	}			}

	define i16 @or16(i16* %p) {			define i16 @or16(i16* %p) {
	; X64-LABEL: or16:			; X64-LABEL: or16:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: mfence			; X64-NEXT: mfence
	; X64-NEXT: movzwl (%rdi), %eax			; X64-NEXT: movzwl (%rdi), %eax
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: or16:			; X86-SSE2-LABEL: or16:
	; X86: # %bb.0:			; X86-SSE2: # %bb.0:
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: mfence			; X86-SSE2-NEXT: mfence
	; X86-NEXT: movzwl (%eax), %eax			; X86-SSE2-NEXT: movzwl (%eax), %eax
	; X86-NEXT: retl			; X86-SSE2-NEXT: retl
				;
				; X86-SLM-LABEL: or16:
				; X86-SLM: # %bb.0:
				; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-SLM-NEXT: movzwl (%ecx), %eax
				; X86-SLM-NEXT: .p2align 4, 0x90
				; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
				; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
				; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
				; X86-SLM-NEXT: jne .LBB1_1
				; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
				; X86-SLM-NEXT: retl
				;
				; X86-ATOM-LABEL: or16:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-ATOM-NEXT: movzwl (%ecx), %eax
				; X86-ATOM-NEXT: .p2align 4, 0x90
				; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
				; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
				; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
				; X86-ATOM-NEXT: jne .LBB1_1
				; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
				; X86-ATOM-NEXT: retl
	%1 = atomicrmw or i16* %p, i16 0 acquire			%1 = atomicrmw or i16* %p, i16 0 acquire
	ret i16 %1			ret i16 %1
	}			}

	define i32 @xor32(i32* %p) {			define i32 @xor32(i32* %p) {
	; X64-LABEL: xor32:			; X64-LABEL: xor32:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: mfence			; X64-NEXT: mfence
	; X64-NEXT: movl (%rdi), %eax			; X64-NEXT: movl (%rdi), %eax
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: xor32:			; X86-SSE2-LABEL: xor32:
	; X86: # %bb.0:			; X86-SSE2: # %bb.0:
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: mfence			; X86-SSE2-NEXT: mfence
	; X86-NEXT: movl (%eax), %eax			; X86-SSE2-NEXT: movl (%eax), %eax
	; X86-NEXT: retl			; X86-SSE2-NEXT: retl
				;
				; X86-SLM-LABEL: xor32:
				; X86-SLM: # %bb.0:
				; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-SLM-NEXT: movl (%ecx), %eax
				; X86-SLM-NEXT: .p2align 4, 0x90
				; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
				; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
				; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
				; X86-SLM-NEXT: jne .LBB2_1
				; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
				; X86-SLM-NEXT: retl
				;
				; X86-ATOM-LABEL: xor32:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-ATOM-NEXT: movl (%ecx), %eax
				; X86-ATOM-NEXT: .p2align 4, 0x90
				; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
				; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
				; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
				; X86-ATOM-NEXT: jne .LBB2_1
				; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
				; X86-ATOM-NEXT: retl
	%1 = atomicrmw xor i32* %p, i32 0 release			%1 = atomicrmw xor i32* %p, i32 0 release
	ret i32 %1			ret i32 %1
	}			}

	define i64 @sub64(i64* %p) {			define i64 @sub64(i64* %p) {
	; X64-LABEL: sub64:			; X64-LABEL: sub64:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: mfence			; X64-NEXT: mfence
	Show All 35 Lines
	; X64-NEXT: .cfi_def_cfa_offset 16			; X64-NEXT: .cfi_def_cfa_offset 16
	; X64-NEXT: xorl %esi, %esi			; X64-NEXT: xorl %esi, %esi
	; X64-NEXT: xorl %edx, %edx			; X64-NEXT: xorl %edx, %edx
	; X64-NEXT: callq __sync_fetch_and_or_16			; X64-NEXT: callq __sync_fetch_and_or_16
	; X64-NEXT: popq %rcx			; X64-NEXT: popq %rcx
	; X64-NEXT: .cfi_def_cfa_offset 8			; X64-NEXT: .cfi_def_cfa_offset 8
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: or128:			; X86-SSE2-LABEL: or128:
	; X86: # %bb.0:			; X86-SSE2: # %bb.0:
	; X86-NEXT: pushl %ebp			; X86-SSE2-NEXT: pushl %ebp
	; X86-NEXT: .cfi_def_cfa_offset 8			; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: .cfi_offset %ebp, -8			; X86-SSE2-NEXT: .cfi_offset %ebp, -8
	; X86-NEXT: movl %esp, %ebp			; X86-SSE2-NEXT: movl %esp, %ebp
	; X86-NEXT: .cfi_def_cfa_register %ebp			; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
	; X86-NEXT: pushl %edi			; X86-SSE2-NEXT: pushl %edi
	; X86-NEXT: pushl %esi			; X86-SSE2-NEXT: pushl %esi
	; X86-NEXT: andl $-8, %esp			; X86-SSE2-NEXT: andl $-8, %esp
	; X86-NEXT: subl $16, %esp			; X86-SSE2-NEXT: subl $16, %esp
	; X86-NEXT: .cfi_offset %esi, -16			; X86-SSE2-NEXT: .cfi_offset %esi, -16
	; X86-NEXT: .cfi_offset %edi, -12			; X86-SSE2-NEXT: .cfi_offset %edi, -12
	; X86-NEXT: movl 8(%ebp), %esi			; X86-SSE2-NEXT: movl 8(%ebp), %esi
	; X86-NEXT: movl %esp, %eax			; X86-SSE2-NEXT: movl %esp, %eax
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl 12(%ebp)			; X86-SSE2-NEXT: pushl 12(%ebp)
	; X86-NEXT: pushl %eax			; X86-SSE2-NEXT: pushl %eax
	; X86-NEXT: calll __sync_fetch_and_or_16			; X86-SSE2-NEXT: calll __sync_fetch_and_or_16
	; X86-NEXT: addl $20, %esp			; X86-SSE2-NEXT: addl $20, %esp
	; X86-NEXT: movl (%esp), %eax			; X86-SSE2-NEXT: movl (%esp), %eax
	; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-NEXT: movl {{[0-9]+}}(%esp), %edi			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
	; X86-NEXT: movl %edi, 8(%esi)			; X86-SSE2-NEXT: movl %edi, 8(%esi)
	; X86-NEXT: movl %edx, 12(%esi)			; X86-SSE2-NEXT: movl %edx, 12(%esi)
	; X86-NEXT: movl %eax, (%esi)			; X86-SSE2-NEXT: movl %eax, (%esi)
	; X86-NEXT: movl %ecx, 4(%esi)			; X86-SSE2-NEXT: movl %ecx, 4(%esi)
	; X86-NEXT: movl %esi, %eax			; X86-SSE2-NEXT: movl %esi, %eax
	; X86-NEXT: leal -8(%ebp), %esp			; X86-SSE2-NEXT: leal -8(%ebp), %esp
	; X86-NEXT: popl %esi			; X86-SSE2-NEXT: popl %esi
	; X86-NEXT: popl %edi			; X86-SSE2-NEXT: popl %edi
	; X86-NEXT: popl %ebp			; X86-SSE2-NEXT: popl %ebp
	; X86-NEXT: .cfi_def_cfa %esp, 4			; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
	; X86-NEXT: retl $4			; X86-SSE2-NEXT: retl $4
				;
				; X86-SLM-LABEL: or128:
				; X86-SLM: # %bb.0:
				; X86-SLM-NEXT: pushl %ebp
				; X86-SLM-NEXT: .cfi_def_cfa_offset 8
				; X86-SLM-NEXT: .cfi_offset %ebp, -8
				; X86-SLM-NEXT: movl %esp, %ebp
				; X86-SLM-NEXT: .cfi_def_cfa_register %ebp
				; X86-SLM-NEXT: pushl %edi
				; X86-SLM-NEXT: pushl %esi
				; X86-SLM-NEXT: andl $-8, %esp
				; X86-SLM-NEXT: subl $16, %esp
				; X86-SLM-NEXT: .cfi_offset %esi, -16
				; X86-SLM-NEXT: .cfi_offset %edi, -12
				; X86-SLM-NEXT: movl 8(%ebp), %esi
				; X86-SLM-NEXT: movl 12(%ebp), %eax
				; X86-SLM-NEXT: movl %esp, %ecx
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl %eax
				; X86-SLM-NEXT: pushl %ecx
				; X86-SLM-NEXT: calll __sync_fetch_and_or_16
				; X86-SLM-NEXT: addl $20, %esp
				; X86-SLM-NEXT: movl (%esp), %eax
				; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edx
				; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edi
				; X86-SLM-NEXT: movl %edi, 8(%esi)
				; X86-SLM-NEXT: movl %edx, 12(%esi)
				; X86-SLM-NEXT: movl %eax, (%esi)
				; X86-SLM-NEXT: movl %ecx, 4(%esi)
				; X86-SLM-NEXT: movl %esi, %eax
				; X86-SLM-NEXT: leal -8(%ebp), %esp
				; X86-SLM-NEXT: popl %esi
				; X86-SLM-NEXT: popl %edi
				; X86-SLM-NEXT: popl %ebp
				; X86-SLM-NEXT: .cfi_def_cfa %esp, 4
				; X86-SLM-NEXT: retl $4
				;
				; X86-ATOM-LABEL: or128:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: pushl %ebp
				; X86-ATOM-NEXT: .cfi_def_cfa_offset 8
				; X86-ATOM-NEXT: .cfi_offset %ebp, -8
				; X86-ATOM-NEXT: leal (%esp), %ebp
				; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp
				; X86-ATOM-NEXT: pushl %edi
				; X86-ATOM-NEXT: pushl %esi
				; X86-ATOM-NEXT: andl $-8, %esp
				; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
				; X86-ATOM-NEXT: .cfi_offset %esi, -16
				; X86-ATOM-NEXT: .cfi_offset %edi, -12
				; X86-ATOM-NEXT: movl 8(%ebp), %esi
				; X86-ATOM-NEXT: movl 12(%ebp), %eax
				; X86-ATOM-NEXT: movl %esp, %ecx
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl %eax
				; X86-ATOM-NEXT: pushl %ecx
				; X86-ATOM-NEXT: calll __sync_fetch_and_or_16
				; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %esp
				; X86-ATOM-NEXT: movl (%esp), %ecx
				; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx
				; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edi
				; X86-ATOM-NEXT: movl %eax, 8(%esi)
				; X86-ATOM-NEXT: movl %edi, 12(%esi)
				; X86-ATOM-NEXT: movl %ecx, (%esi)
				; X86-ATOM-NEXT: movl %esi, %eax
				; X86-ATOM-NEXT: movl %edx, 4(%esi)
				; X86-ATOM-NEXT: leal -8(%ebp), %esp
				; X86-ATOM-NEXT: popl %esi
				; X86-ATOM-NEXT: popl %edi
				; X86-ATOM-NEXT: popl %ebp
				; X86-ATOM-NEXT: .cfi_def_cfa %esp, 4
				; X86-ATOM-NEXT: retl $4
	%1 = atomicrmw or i128* %p, i128 0 monotonic			%1 = atomicrmw or i128* %p, i128 0 monotonic
	ret i128 %1			ret i128 %1
	}			}

	; For 'and', the idempotent value is (-1)			; For 'and', the idempotent value is (-1)
	define i32 @and32 (i32* %p) {			define i32 @and32 (i32* %p) {
	; X64-LABEL: and32:			; X64-LABEL: and32:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: mfence			; X64-NEXT: mfence
	; X64-NEXT: movl (%rdi), %eax			; X64-NEXT: movl (%rdi), %eax
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: and32:			; X86-SSE2-LABEL: and32:
	; X86: # %bb.0:			; X86-SSE2: # %bb.0:
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: mfence			; X86-SSE2-NEXT: mfence
	; X86-NEXT: movl (%eax), %eax			; X86-SSE2-NEXT: movl (%eax), %eax
	; X86-NEXT: retl			; X86-SSE2-NEXT: retl
				;
				; X86-SLM-LABEL: and32:
				; X86-SLM: # %bb.0:
				; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-SLM-NEXT: movl (%ecx), %eax
				; X86-SLM-NEXT: .p2align 4, 0x90
				; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
				; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
				; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
				; X86-SLM-NEXT: jne .LBB5_1
				; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
				; X86-SLM-NEXT: retl
				;
				; X86-ATOM-LABEL: and32:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-ATOM-NEXT: movl (%ecx), %eax
				; X86-ATOM-NEXT: .p2align 4, 0x90
				; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
				; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
				; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
				; X86-ATOM-NEXT: jne .LBB5_1
				; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
				; X86-ATOM-NEXT: retl
	%1 = atomicrmw and i32* %p, i32 -1 acq_rel			%1 = atomicrmw and i32* %p, i32 -1 acq_rel
	ret i32 %1			ret i32 %1
	}			}

	define void @or32_nouse_monotonic(i32* %p) {			define void @or32_nouse_monotonic(i32* %p) {
	; CHECK-LABEL: or32_nouse_monotonic:			; X64-LABEL: or32_nouse_monotonic:
	; CHECK: # %bb.0:			; X64: # %bb.0:
	; CHECK-NEXT: #MEMBARRIER			; X64-NEXT: #MEMBARRIER
	; CHECK-NEXT: ret{{[l\|q]}}			; X64-NEXT: retq
				;
				; X86-GENERIC-LABEL: or32_nouse_monotonic:
				; X86-GENERIC: # %bb.0:
				; X86-GENERIC-NEXT: #MEMBARRIER
				; X86-GENERIC-NEXT: retl
				;
				; X86-ATOM-LABEL: or32_nouse_monotonic:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: #MEMBARRIER
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 monotonic			atomicrmw or i32* %p, i32 0 monotonic
	ret void			ret void
	}			}


	define void @or32_nouse_acquire(i32* %p) {			define void @or32_nouse_acquire(i32* %p) {
	; CHECK-LABEL: or32_nouse_acquire:			; X64-LABEL: or32_nouse_acquire:
	; CHECK: # %bb.0:			; X64: # %bb.0:
	; CHECK-NEXT: #MEMBARRIER			; X64-NEXT: #MEMBARRIER
	; CHECK-NEXT: ret{{[l\|q]}}			; X64-NEXT: retq
				;
				; X86-GENERIC-LABEL: or32_nouse_acquire:
				; X86-GENERIC: # %bb.0:
				; X86-GENERIC-NEXT: #MEMBARRIER
				; X86-GENERIC-NEXT: retl
				;
				; X86-ATOM-LABEL: or32_nouse_acquire:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: #MEMBARRIER
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 acquire			atomicrmw or i32* %p, i32 0 acquire
	ret void			ret void
	}			}

	define void @or32_nouse_release(i32* %p) {			define void @or32_nouse_release(i32* %p) {
	; CHECK-LABEL: or32_nouse_release:			; X64-LABEL: or32_nouse_release:
	; CHECK: # %bb.0:			; X64: # %bb.0:
	; CHECK-NEXT: #MEMBARRIER			; X64-NEXT: #MEMBARRIER
	; CHECK-NEXT: ret{{[l\|q]}}			; X64-NEXT: retq
				;
				; X86-GENERIC-LABEL: or32_nouse_release:
				; X86-GENERIC: # %bb.0:
				; X86-GENERIC-NEXT: #MEMBARRIER
				; X86-GENERIC-NEXT: retl
				;
				; X86-ATOM-LABEL: or32_nouse_release:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: #MEMBARRIER
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 release			atomicrmw or i32* %p, i32 0 release
	ret void			ret void
	}			}

	define void @or32_nouse_acq_rel(i32* %p) {			define void @or32_nouse_acq_rel(i32* %p) {
	; CHECK-LABEL: or32_nouse_acq_rel:			; X64-LABEL: or32_nouse_acq_rel:
	; CHECK: # %bb.0:			; X64: # %bb.0:
	; CHECK-NEXT: #MEMBARRIER			; X64-NEXT: #MEMBARRIER
	; CHECK-NEXT: ret{{[l\|q]}}			; X64-NEXT: retq
				;
				; X86-GENERIC-LABEL: or32_nouse_acq_rel:
				; X86-GENERIC: # %bb.0:
				; X86-GENERIC-NEXT: #MEMBARRIER
				; X86-GENERIC-NEXT: retl
				;
				; X86-ATOM-LABEL: or32_nouse_acq_rel:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: #MEMBARRIER
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 acq_rel			atomicrmw or i32* %p, i32 0 acq_rel
	ret void			ret void
	}			}

	define void @or32_nouse_seq_cst(i32* %p) {			define void @or32_nouse_seq_cst(i32* %p) {
	; X64-LABEL: or32_nouse_seq_cst:			; X64-LABEL: or32_nouse_seq_cst:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)			; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: or32_nouse_seq_cst:			; X86-GENERIC-LABEL: or32_nouse_seq_cst:
	; X86: # %bb.0:			; X86-GENERIC: # %bb.0:
	; X86-NEXT: lock orl $0, (%esp)			; X86-GENERIC-NEXT: lock orl $0, (%esp)
	; X86-NEXT: retl			; X86-GENERIC-NEXT: retl
				;
				; X86-ATOM-LABEL: or32_nouse_seq_cst:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: lock orl $0, (%esp)
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 seq_cst			atomicrmw or i32* %p, i32 0 seq_cst
	ret void			ret void
	}			}

	; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded			; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
	define void @or64_nouse_seq_cst(i64* %p) {			define void @or64_nouse_seq_cst(i64* %p) {
	; X64-LABEL: or64_nouse_seq_cst:			; X64-LABEL: or64_nouse_seq_cst:
	; X64: # %bb.0:			; X64: # %bb.0:
	Show All 36 Lines
	; X64-NEXT: .cfi_def_cfa_offset 16			; X64-NEXT: .cfi_def_cfa_offset 16
	; X64-NEXT: xorl %esi, %esi			; X64-NEXT: xorl %esi, %esi
	; X64-NEXT: xorl %edx, %edx			; X64-NEXT: xorl %edx, %edx
	; X64-NEXT: callq __sync_fetch_and_or_16			; X64-NEXT: callq __sync_fetch_and_or_16
	; X64-NEXT: popq %rax			; X64-NEXT: popq %rax
	; X64-NEXT: .cfi_def_cfa_offset 8			; X64-NEXT: .cfi_def_cfa_offset 8
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: or128_nouse_seq_cst:			; X86-SSE2-LABEL: or128_nouse_seq_cst:
	; X86: # %bb.0:			; X86-SSE2: # %bb.0:
	; X86-NEXT: pushl %ebp			; X86-SSE2-NEXT: pushl %ebp
	; X86-NEXT: .cfi_def_cfa_offset 8			; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: .cfi_offset %ebp, -8			; X86-SSE2-NEXT: .cfi_offset %ebp, -8
	; X86-NEXT: movl %esp, %ebp			; X86-SSE2-NEXT: movl %esp, %ebp
	; X86-NEXT: .cfi_def_cfa_register %ebp			; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
	; X86-NEXT: andl $-8, %esp			; X86-SSE2-NEXT: andl $-8, %esp
	; X86-NEXT: subl $16, %esp			; X86-SSE2-NEXT: subl $16, %esp
	; X86-NEXT: movl %esp, %eax			; X86-SSE2-NEXT: movl %esp, %eax
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl $0			; X86-SSE2-NEXT: pushl $0
	; X86-NEXT: pushl 8(%ebp)			; X86-SSE2-NEXT: pushl 8(%ebp)
	; X86-NEXT: pushl %eax			; X86-SSE2-NEXT: pushl %eax
	; X86-NEXT: calll __sync_fetch_and_or_16			; X86-SSE2-NEXT: calll __sync_fetch_and_or_16
	; X86-NEXT: addl $20, %esp			; X86-SSE2-NEXT: addl $20, %esp
	; X86-NEXT: movl %ebp, %esp			; X86-SSE2-NEXT: movl %ebp, %esp
	; X86-NEXT: popl %ebp			; X86-SSE2-NEXT: popl %ebp
	; X86-NEXT: .cfi_def_cfa %esp, 4			; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
	; X86-NEXT: retl			; X86-SSE2-NEXT: retl
				;
				; X86-SLM-LABEL: or128_nouse_seq_cst:
				; X86-SLM: # %bb.0:
				; X86-SLM-NEXT: pushl %ebp
				; X86-SLM-NEXT: .cfi_def_cfa_offset 8
				; X86-SLM-NEXT: .cfi_offset %ebp, -8
				; X86-SLM-NEXT: movl %esp, %ebp
				; X86-SLM-NEXT: .cfi_def_cfa_register %ebp
				; X86-SLM-NEXT: andl $-8, %esp
				; X86-SLM-NEXT: subl $16, %esp
				; X86-SLM-NEXT: movl 8(%ebp), %eax
				; X86-SLM-NEXT: movl %esp, %ecx
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl $0
				; X86-SLM-NEXT: pushl %eax
				; X86-SLM-NEXT: pushl %ecx
				; X86-SLM-NEXT: calll __sync_fetch_and_or_16
				; X86-SLM-NEXT: addl $20, %esp
				; X86-SLM-NEXT: movl %ebp, %esp
				; X86-SLM-NEXT: popl %ebp
				; X86-SLM-NEXT: .cfi_def_cfa %esp, 4
				; X86-SLM-NEXT: retl
				;
				; X86-ATOM-LABEL: or128_nouse_seq_cst:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: pushl %ebp
				; X86-ATOM-NEXT: .cfi_def_cfa_offset 8
				; X86-ATOM-NEXT: .cfi_offset %ebp, -8
				; X86-ATOM-NEXT: leal (%esp), %ebp
				; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp
				; X86-ATOM-NEXT: andl $-8, %esp
				; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
				; X86-ATOM-NEXT: movl 8(%ebp), %eax
				; X86-ATOM-NEXT: movl %esp, %ecx
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl $0
				; X86-ATOM-NEXT: pushl %eax
				; X86-ATOM-NEXT: pushl %ecx
				; X86-ATOM-NEXT: calll __sync_fetch_and_or_16
				; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %esp
				; X86-ATOM-NEXT: movl %ebp, %esp
				; X86-ATOM-NEXT: popl %ebp
				; X86-ATOM-NEXT: .cfi_def_cfa %esp, 4
				; X86-ATOM-NEXT: retl
	atomicrmw or i128* %p, i128 0 seq_cst			atomicrmw or i128* %p, i128 0 seq_cst
	ret void			ret void
	}			}


	define void @or16_nouse_seq_cst(i16* %p) {			define void @or16_nouse_seq_cst(i16* %p) {
	; X64-LABEL: or16_nouse_seq_cst:			; X64-LABEL: or16_nouse_seq_cst:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)			; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: or16_nouse_seq_cst:			; X86-GENERIC-LABEL: or16_nouse_seq_cst:
	; X86: # %bb.0:			; X86-GENERIC: # %bb.0:
	; X86-NEXT: lock orl $0, (%esp)			; X86-GENERIC-NEXT: lock orl $0, (%esp)
	; X86-NEXT: retl			; X86-GENERIC-NEXT: retl
				;
				; X86-ATOM-LABEL: or16_nouse_seq_cst:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: lock orl $0, (%esp)
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	atomicrmw or i16* %p, i16 0 seq_cst			atomicrmw or i16* %p, i16 0 seq_cst
	ret void			ret void
	}			}

	define void @or8_nouse_seq_cst(i8* %p) {			define void @or8_nouse_seq_cst(i8* %p) {
	; X64-LABEL: or8_nouse_seq_cst:			; X64-LABEL: or8_nouse_seq_cst:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)			; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: or8_nouse_seq_cst:			; X86-GENERIC-LABEL: or8_nouse_seq_cst:
	; X86: # %bb.0:			; X86-GENERIC: # %bb.0:
	; X86-NEXT: lock orl $0, (%esp)			; X86-GENERIC-NEXT: lock orl $0, (%esp)
	; X86-NEXT: retl			; X86-GENERIC-NEXT: retl
				;
				; X86-ATOM-LABEL: or8_nouse_seq_cst:
				; X86-ATOM: # %bb.0:
				; X86-ATOM-NEXT: lock orl $0, (%esp)
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: nop
				; X86-ATOM-NEXT: retl
	atomicrmw or i8* %p, i8 0 seq_cst			atomicrmw or i8* %p, i8 0 seq_cst
	ret void			ret void
	}			}