Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -25721,6 +25721,14 @@
   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
     return nullptr;
 
+  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+  // lowering available in lowerAtomicArith.
+  // TODO: push more cases through this path.
+  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+        AI->use_empty())
+      return nullptr;
+
   auto Builder = IRBuilder<>(AI);
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   auto SSID = AI->getSyncScopeID();
@@ -26217,6 +26225,59 @@
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix.  Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable.  Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget,
+                                 SDValue Chain, SDLoc DL) {
+  // Implementation notes:
+  // 1) LOCK prefix creates a full read/write reordering barrier for memory
+  //    operations issued by the current processor.  As such, the location
+  //    referenced is not relevant for the ordering properties of the instruction.
+  //    See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
+  //    8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+  // 2) Using an immediate operand appears to be the best encoding choice
+  //    here since it doesn't require an extra register.
+  // 3) OR appears to be very slightly faster than ADD.  (Though, the difference
+  //    is small enough it might just be measurement noise.)
+  // 4) For the moment, we are using top of stack.  This creates false sharing
+  //    with actual stack access/call sequences, and it would be better to use a
+  //    location within the redzone.  For the moment, this is still better than an
+  //    mfence though.  TODO: Revise the offset used when we can assume a redzone.
+  //
+  // For a general discussion of the tradeoffs and benchmark results, see:
+  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+  if (Subtarget.is64Bit()) {
+    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i8);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::RSP, MVT::i64),     // Base
+      DAG.getTargetConstant(1, DL, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i64),            // Index
+      DAG.getTargetConstant(0, DL, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),            // Segment.
+      Zero,
+      Chain};
+    SDNode *Res = DAG.getMachineNode(X86::LOCK_OR32mi8, DL, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+  SDValue Ops[] = {
+    DAG.getRegister(X86::ESP, MVT::i32),     // Base
+    DAG.getTargetConstant(1, DL, MVT::i8),   // Scale
+    DAG.getRegister(0, MVT::i32),            // Index
+    DAG.getTargetConstant(0, DL, MVT::i32),  // Disp
+    DAG.getRegister(0, MVT::i32),            // Segment.
+    Zero,
+    Chain
+  };
+  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   unsigned NewOpc = 0;
@@ -26251,6 +26312,7 @@
 /// Lower atomic_load_ops into LOCK-prefixed operations.
 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
+  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   SDValue Chain = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
@@ -26265,7 +26327,6 @@
   // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
   // select LXADD if LOCK_SUB can't be selected.
   if (Opc == ISD::ATOMIC_LOAD_SUB) {
-    AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
     RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
     return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
                          AN->getMemOperand());
@@ -26275,6 +26336,32 @@
     return N;
   }
 
+  // Specialized lowering for the canonical form of an idempotent atomicrmw.
+  // The core idea here is that since the memory location isn't actually
+  // changing, all we need is a lowering for the *ordering* impacts of the
+  // atomicrmw.  As such, we can choose a different operation and memory
+  // location to minimize impact on other code.
+  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+    // On X86, the only ordering which actually requires an instruction is
+    // seq_cst which isn't SingleThread; everything else just needs to be
+    // preserved during codegen and then dropped.  Note that we expect (but
+    // don't assume) that orderings other than seq_cst and acq_rel have been
+    // canonicalized to a store or load.
+    if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+        AN->getSyncScopeID() == SyncScope::System) {
+      // Prefer a locked operation against a stack location to minimize cache
+      // traffic.  This assumes that stack locations are very likely to be
+      // accessed only by the owning thread.
+      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+      DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+      return SDValue();
+    }
+    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+    SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+    return SDValue();
+  }
+
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
Index: test/CodeGen/X86/atomic-idempotent.ll
===================================================================
--- test/CodeGen/X86/atomic-idempotent.ll
+++ test/CodeGen/X86/atomic-idempotent.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-linux- -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-linux- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
 
 ; On x86, an atomic rmw operation that does not modify the value in memory
 ; (such as atomic add 0) can be replaced by an mfence followed by a mov.
@@ -107,27 +107,34 @@
 ;
 ; X32-LABEL: or128:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %ebp, -8
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    .cfi_def_cfa_register %ebp
 ; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $16, %esp
-; X32-NEXT:    .cfi_offset %esi, -16
-; X32-NEXT:    .cfi_offset %edi, -12
-; X32-NEXT:    movl 8(%ebp), %esi
-; X32-NEXT:    movl %esp, %eax
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    subl $20, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 32
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_adjust_cfa_offset 8
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    pushl $0
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl $0
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl $0
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl 12(%ebp)
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    calll __sync_fetch_and_or_16
-; X32-NEXT:    addl $20, %esp
+; X32-NEXT:    .cfi_adjust_cfa_offset -4
+; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    .cfi_adjust_cfa_offset -28
 ; X32-NEXT:    movl (%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -137,11 +144,12 @@
 ; X32-NEXT:    movl %eax, (%esi)
 ; X32-NEXT:    movl %ecx, 4(%esi)
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    leal -8(%ebp), %esp
+; X32-NEXT:    addl $20, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 12
 ; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    .cfi_def_cfa %esp, 4
+; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl $4
   %1 = atomicrmw or i128* %p, i128 0 monotonic
   ret i128 %1
@@ -166,70 +174,38 @@
 }
 
 define void @or32_nouse_monotonic(i32* %p) {
-; X64-LABEL: or32_nouse_monotonic:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_monotonic:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 monotonic
   ret void
 }
 
 define void @or32_nouse_acquire(i32* %p) {
-; X64-LABEL: or32_nouse_acquire:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acquire:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acquire:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acquire
   ret void
 }
 
 define void @or32_nouse_release(i32* %p) {
-; X64-LABEL: or32_nouse_release:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_release:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_release:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 release
   ret void
 }
 
 define void @or32_nouse_acq_rel(i32* %p) {
-; X64-LABEL: or32_nouse_acq_rel:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acq_rel:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acq_rel:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acq_rel
   ret void
 }
@@ -237,15 +213,12 @@
 define void @or32_nouse_seq_cst(i32* %p) {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or32_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i32* %p, i32 0 seq_cst
   ret void
@@ -255,8 +228,7 @@
 define void @or64_nouse_seq_cst(i64* %p) {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or64_nouse_seq_cst:
@@ -302,25 +274,25 @@
 ;
 ; X32-LABEL: or128_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %ebp, -8
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    .cfi_def_cfa_register %ebp
-; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $16, %esp
-; X32-NEXT:    movl %esp, %eax
+; X32-NEXT:    subl $36, %esp
+; X32-NEXT:    .cfi_adjust_cfa_offset 36
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    pushl $0
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl $0
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl $0
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl 8(%ebp)
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
 ; X32-NEXT:    calll __sync_fetch_and_or_16
-; X32-NEXT:    addl $20, %esp
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    .cfi_def_cfa %esp, 4
+; X32-NEXT:    .cfi_adjust_cfa_offset -4
+; X32-NEXT:    addl $56, %esp
+; X32-NEXT:    .cfi_adjust_cfa_offset -56
 ; X32-NEXT:    retl
 ; X128-LABEL: or128_nouse_seq_cst:
 ; X128:       # %bb.0:
@@ -334,15 +306,12 @@
 define void @or16_nouse_seq_cst(i16* %p) {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or16_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movzwl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i16* %p, i16 0 seq_cst
   ret void
@@ -351,15 +320,12 @@
 define void @or8_nouse_seq_cst(i8* %p) {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movb (%rdi), %al
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or8_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movb (%eax), %al
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i8* %p, i8 0 seq_cst
   ret void
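
Note (not part of the patch): as a minimal standalone sketch of the fence idiom that the new emitLockedStackOp() produces on 64-bit targets, the C++ below expresses the same operation via GNU inline assembly. The function name lockedStackFence and the inline-asm formulation are illustrative assumptions only; the patch itself builds the equivalent LOCK_OR32mi8 machine node directly in SelectionDAG.

    // Illustrative sketch only: a LOCK-prefixed OR of an immediate zero into
    // the top of the stack changes no memory, but acts as a full read/write
    // ordering barrier on x86, so it can stand in for MFENCE when only the
    // ordering (not the stored value) matters.
    static inline void lockedStackFence() {
      __asm__ __volatile__("lock orl $0, (%%rsp)" ::: "memory", "cc");
    }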