Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -25727,6 +25727,14 @@
   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
     return nullptr;
 
+  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+  // lowering available in lowerAtomicArith.
+  // TODO: push more cases through this path.
+  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+        AI->use_empty())
+      return nullptr;
+
   auto Builder = IRBuilder<>(AI);
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   auto SSID = AI->getSyncScopeID();
@@ -26223,6 +26231,59 @@
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix. The location is chosen to
+/// be a) very likely accessed only by a single thread to minimize cache
+/// traffic, and b) definitely dereferenceable. Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget,
+                                 SDValue Chain, SDLoc DL) {
+  // Implementation notes:
+  //  1) The LOCK prefix creates a full read/write reordering barrier for
+  //  memory operations issued by the current processor. As such, the location
+  //  referenced is not relevant for the ordering properties of the
+  //  instruction. See: Intel® 64 and IA-32 Architectures Software Developer's
+  //  Manual, 8.2.3.9, Loads and Stores Are Not Reordered with Locked
+  //  Instructions.
+  //  2) Using an immediate operand appears to be the best encoding choice
+  //  here since it doesn't require an extra register.
+  //  3) OR appears to be very slightly faster than ADD. (Though the
+  //  difference is small enough it might just be measurement noise.)
+  //  4) For the moment, we are using the top of the stack. This creates false
+  //  sharing with actual stack access/call sequences, and it would be better
+  //  to use a location within the redzone. Even so, this is still better than
+  //  an mfence. TODO: Revise the offset used when we can assume a redzone.
+  //
+  //  For a general discussion of the tradeoffs and benchmark results, see:
+  //  https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+  if (Subtarget.is64Bit()) {
+    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i8);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::RSP, MVT::i64),     // Base
+      DAG.getTargetConstant(1, DL, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i64),            // Index
+      DAG.getTargetConstant(0, DL, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),            // Segment
+      Zero,
+      Chain};
+    SDNode *Res = DAG.getMachineNode(X86::LOCK_OR32mi8, DL, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+  SDValue Ops[] = {
+    DAG.getRegister(X86::ESP, MVT::i32),     // Base
+    DAG.getTargetConstant(1, DL, MVT::i8),   // Scale
+    DAG.getRegister(0, MVT::i32),            // Index
+    DAG.getTargetConstant(0, DL, MVT::i32),  // Disp
+    DAG.getRegister(0, MVT::i32),            // Segment
+    Zero,
+    Chain
+  };
+  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   unsigned NewOpc = 0;
@@ -26257,6 +26318,7 @@
 /// Lower atomic_load_ops into LOCK-prefixed operations.
 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
+  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   SDValue Chain = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
@@ -26271,7 +26333,6 @@
   // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
   // select LXADD if LOCK_SUB can't be selected.
   if (Opc == ISD::ATOMIC_LOAD_SUB) {
-    AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
     RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
     return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
                          AN->getMemOperand());
@@ -26281,6 +26342,32 @@
     return N;
   }
 
+  // Specialized lowering for the canonical form of an idempotent atomicrmw.
+  // The core idea here is that since the memory location isn't actually
+  // changing, all we need is a lowering for the *ordering* impacts of the
+  // atomicrmw. As such, we can choose a different operation and memory
+  // location to minimize impact on other code.
+  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+    // On X86, the only ordering which actually requires an instruction is a
+    // seq_cst operation that isn't SingleThread; everything else just needs
+    // to be preserved during codegen and then dropped. Note that we expect
+    // (but don't assume) that orderings other than seq_cst and acq_rel have
+    // been canonicalized to a store or load.
+    if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+        AN->getSyncScopeID() == SyncScope::System) {
+      // Prefer a locked operation against a stack location to minimize cache
+      // traffic. This assumes that stack locations are very likely to be
+      // accessed only by the owning thread.
+      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+      DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+      return SDValue();
+    }
+    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+    SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+    return SDValue();
+  }
+
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
Index: llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll
+++ llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll
@@ -166,70 +166,38 @@
 }
 
 define void @or32_nouse_monotonic(i32* %p) {
-; X64-LABEL: or32_nouse_monotonic:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_monotonic:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 monotonic
   ret void
 }
 
 define void @or32_nouse_acquire(i32* %p) {
-; X64-LABEL: or32_nouse_acquire:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acquire:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acquire:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acquire
   ret void
 }
 
 define void @or32_nouse_release(i32* %p) {
-; X64-LABEL: or32_nouse_release:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_release:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_release:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 release
   ret void
 }
 
 define void @or32_nouse_acq_rel(i32* %p) {
-; X64-LABEL: or32_nouse_acq_rel:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acq_rel:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acq_rel:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acq_rel
   ret void
 }
@@ -237,15 +205,12 @@
 define void @or32_nouse_seq_cst(i32* %p) {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or32_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i32* %p, i32 0 seq_cst
   ret void
@@ -255,8 +220,7 @@
 define void @or64_nouse_seq_cst(i64* %p) {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or64_nouse_seq_cst:
@@ -334,15 +298,12 @@
 define void @or16_nouse_seq_cst(i16* %p) {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or16_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movzwl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i16* %p, i16 0 seq_cst
   ret void
@@ -351,15 +312,12 @@
 define void @or8_nouse_seq_cst(i8* %p) {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movb (%rdi), %al
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or8_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movb (%eax), %al
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i8* %p, i8 0 seq_cst
   ret void
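
To illustrate the seq_cst path through emitLockedStackOp, here is a minimal IR sketch; the function name is illustrative and not taken from the patch or its tests. An idempotent "atomicrmw or ..., 0 seq_cst" whose result is unused leaves the memory at %p unchanged, so only its ordering effect has to survive, and that is now provided by a locked no-op against the stack instead of an mfence plus reload:

  define void @seq_cst_fence_via_idempotent_or(i32* %p) {
    ; The stored value equals the loaded one (x | 0 == x) and the result is
    ; dead, so only the seq_cst ordering of this atomicrmw must be preserved.
    atomicrmw or i32* %p, i32 0 seq_cst
    ret void
  }
  ; Expected x86-64 codegen after this patch (cf. or32_nouse_seq_cst above):
  ;   lock orl $0, (%rsp)
  ;   retq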
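For the weaker orderings, the same pattern now folds to the X86ISD::MEMBARRIER pseudo, which constrains compiler reordering but emits no instruction. Again a sketch with an illustrative function name:

  define void @acquire_barrier_via_idempotent_or(i32* %p) {
    atomicrmw or i32* %p, i32 0 acquire    ; likewise monotonic/release/acq_rel
    ret void
  }
  ; Expected codegen: only the #MEMBARRIER marker in the asm comments, then ret.

IR like this typically originates from source that uses an idempotent read-modify-write as a fence, for example a discarded std::atomic<int>::fetch_or(0, std::memory_order_seq_cst) call, a pattern similar to those benchmarked in the Shipilev post linked from the implementation notes.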