Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -25449,6 +25449,14 @@
   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
     return nullptr;
 
+  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+  // lowering available in lowerAtomicArith.
+  // TODO: push more cases through this path.
+  if (auto C = dyn_cast<ConstantInt>(AI->getValOperand()))
+    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+        AI->use_empty())
+      return nullptr;
+
   auto Builder = IRBuilder<>(AI);
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   auto SSID = AI->getSyncScopeID();
@@ -25947,6 +25955,65 @@
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix.  Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable.  Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget,
+                                 SDValue Chain, SDLoc DL) {
+  // Implementation notes:
+  // 1) LOCK prefix creates a full read/write reordering barrier for memory
+  // operations issued by the current processor.  As such, the location
+  // referenced is not relevant for the ordering properties of the instruction.
+  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
+  // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
+  // 2) Using an immediate operand appears to be the best encoding choice
+  // here since it doesn't require an extra register.
+  // 3) OR appears to be very slightly faster than ADD.  (Though the difference
+  // is small enough it might just be measurement noise.)
+  // 4) Using a slight offset from the stack pointer avoids creating a false
+  // read/write dependence on the value at RSP - which is very likely to be a
+  // spill slot in idiomatic code.  We'd be better off still using a larger
+  // offset, but we need to stay within the redzone to ensure the location is
+  // dereferenceable.
+  //
+  // For a general discussion of the tradeoffs and benchmark results, see:
+  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+  // NOTE TO REVIEWERS: Do all x86 calling conventions have a redzone?  I
+  // suspect not, but can't find this property to test against.  (usesRedZone
+  // on X86FI is not it.)
+  int SPOffset = -8;
+
+  if (Subtarget.is64Bit()) {
+    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i64);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::RSP, MVT::i64),            // Base
+      DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
+      DAG.getRegister(0, MVT::i64),                   // Index
+      DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),                   // Segment.
+      Zero,
+      Chain};
+    SDNode *Res = DAG.getMachineNode(X86::LOCK_OR64mi32, DL, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+  SDValue Ops[] = {
+    DAG.getRegister(X86::ESP, MVT::i32),            // Base
+    DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
+    DAG.getRegister(0, MVT::i32),                   // Index
+    DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
+    DAG.getRegister(0, MVT::i32),                   // Segment.
+    Zero,
+    Chain
+  };
+  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   unsigned NewOpc = 0;
@@ -25981,6 +26048,7 @@
 /// Lower atomic_load_ops into LOCK-prefixed operations.
 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
+  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   SDValue Chain = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
@@ -25995,7 +26063,6 @@
   // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
   // select LXADD if LOCK_SUB can't be selected.
   if (Opc == ISD::ATOMIC_LOAD_SUB) {
-    AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
     RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
     return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
                          AN->getMemOperand());
@@ -26005,6 +26072,34 @@
     return N;
   }
 
+  // Specialized lowering for the canonical form of an idempotent atomicrmw.
+  // The core idea here is that since the memory location isn't actually
+  // changing, all we need is a lowering for the *ordering* impacts of the
+  // atomicrmw.  As such, we can choose a different operation and memory
+  // location to minimize impact on other code.
+  if (Opc == ISD::ATOMIC_LOAD_OR &&
+      isa<ConstantSDNode>(RHS) &&
+      cast<ConstantSDNode>(RHS)->isNullValue()) {
+    // On X86, the only ordering which actually requires an instruction is
+    // seq_cst which isn't SingleThread; everything else just needs to be
+    // preserved during codegen and then dropped. Note that we expect (but
+    // don't assume) that orderings other than seq_cst and acq_rel have been
+    // canonicalized to a store or load.
+    if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+        AN->getSyncScopeID() == SyncScope::System) {
+      // Prefer a locked operation against a stack location to minimize cache
+      // traffic.  This assumes that stack locations are very likely to be
+      // accessed only by the owning thread.
+      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+      DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+      return SDValue();
+    }
+    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+    SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+    return SDValue();
+  }
+
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
Index: test/CodeGen/X86/atomic-idempotent.ll
===================================================================
--- test/CodeGen/X86/atomic-idempotent.ll
+++ test/CodeGen/X86/atomic-idempotent.ll
@@ -164,3 +164,80 @@
   %1 = atomicrmw and i32* %p, i32 -1 acq_rel
   ret i32 %1
 }
+
+define void @or32_nouse_acquire(i32* %p) {
+; CHECK-LABEL: or32_nouse_acquire:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
+  atomicrmw or i32* %p, i32 0 acquire
+  ret void
+}
+
+define void @or32_nouse_release(i32* %p) {
+; CHECK-LABEL: or32_nouse_release:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
+  atomicrmw or i32* %p, i32 0 release
+  ret void
+}
+
+define void @or32_nouse_acq_rel(i32* %p) {
+; CHECK-LABEL: or32_nouse_acq_rel:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
+  atomicrmw or i32* %p, i32 0 acq_rel
+  ret void
+}
+
+define void @or32_nouse_seq_cst(i32* %p) {
+; X64-LABEL: or32_nouse_seq_cst:
+; X64:       # %bb.0:
+; X64-NEXT:    lock orq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+;
+; X32-LABEL: or32_nouse_seq_cst:
+; X32:       # %bb.0:
+; X32-NEXT:    lock orl $0, -{{[0-9]+}}(%esp)
+; X32-NEXT:    retl
+  atomicrmw or i32* %p, i32 0 seq_cst
+  ret void
+}
+
+; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
+define void @or64_nouse_seq_cst(i64* %p) {
+; X64-LABEL: or64_nouse_seq_cst:
+; X64:       # %bb.0:
+; X64-NEXT:    lock orq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+;
+; X32-LABEL: or64_nouse_seq_cst:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %ebx, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl (%esi), %eax
+; X32-NEXT:    movl 4(%esi), %edx
+; X32-NEXT:    .p2align 4, 0x90
+; X32-NEXT:  .LBB10_1: # %atomicrmw.start
+; X32-NEXT:    # =>This Inner Loop Header: Depth=1
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    lock cmpxchg8b (%esi)
+; X32-NEXT:    jne .LBB10_1
+; X32-NEXT:  # %bb.2: # %atomicrmw.end
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+  atomicrmw or i64* %p, i64 0 seq_cst
+  ret void
+}
+
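
For context (illustrative only, not part of the patch): the "canonical idempotent atomicrmw" handled above is what a frontend like Clang emits for a seq_cst fetch_or of zero whose result is discarded, an idiom occasionally used as a full fence. A minimal C++ sketch follows; the function name is hypothetical, and the expected output is taken from the or32_nouse_seq_cst test above.

  // Hypothetical example; the source pattern is the point, not the name.
  #include <atomic>

  void seq_cst_fence_via_rmw(std::atomic<int> &X) {
    // Emits an "atomicrmw or ... , i32 0 seq_cst" whose result is unused,
    // i.e. the no-use, or-with-zero form the patch recognizes.  With this
    // change, x86-64 is expected to lower it to
    //   lock orq $0, -8(%rsp)
    // (see or32_nouse_seq_cst) rather than going through the fence + load
    // path that the first hunk now skips.
    (void)X.fetch_or(0, std::memory_order_seq_cst);
  }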