Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Instructions.h"
@@ -193,6 +194,7 @@
     SDNode *Select(SDNode *N) override;
     SDNode *SelectGather(SDNode *N, unsigned Opc);
     SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
+    SDNode *SelectAtomicAddZero(SDNode *Node, MVT NVT);

     bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
     bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -1743,6 +1745,80 @@
   return Val;
 }

+// On x86, an atomic load-add of the constant 0 can be replaced by an mfence
+// followed by a mov. A detailed explanation (and an example of why the
+// mfence is required) is available at
+// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+// The general idea is that only a store operation can have release
+// semantics, so a seqlock (whose reader side is implemented entirely with
+// loads) needs a release operation at the end of its critical section to
+// prevent operations from being sunk out of it. Replacing the last load
+// with a fetch_add(0, release) accomplishes just that, but it relies on
+// this optimization to preserve the desirable property of seqlocks that
+// readers do not cause cache-line bouncing.
+// The mfence is required because without it the plain load could be hoisted
+// above a preceding store (store-load is the only reordering the x86 memory
+// model allows), whereas the original fetch_add, being a locked
+// instruction, could not be reordered that way.
+SDNode *X86DAGToDAGISel::SelectAtomicAddZero(SDNode *Node, MVT NVT) {
+  assert(Node->getOpcode() == ISD::ATOMIC_LOAD_ADD);
+
+  SDLoc dl(Node);
+
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ptr = Node->getOperand(1);
+  SDValue Val = Node->getOperand(2);
+  SDValue Base, Scale, Index, Disp, Segment;
+  if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
+    return nullptr;
+
+  auto *CN = dyn_cast<ConstantSDNode>(Val);
+  if (!CN)
+    return nullptr;
+
+  int64_t CNVal = CN->getSExtValue();
+  if (CNVal != 0)
+    return nullptr;
+
+  auto *FenceNode = CurDAG->getMachineNode(X86::MFENCE, dl, MVT::Other, Chain);
+
+  unsigned Opc;
+  switch (NVT.SimpleTy) {
+  case MVT::i8:
+    Opc = X86::ACQUIRE_MOV8rm;
+    break;
+  case MVT::i16:
+    Opc = X86::ACQUIRE_MOV16rm;
+    break;
+  case MVT::i32:
+    Opc = X86::ACQUIRE_MOV32rm;
+    break;
+  case MVT::i64:
+    Opc = X86::ACQUIRE_MOV64rm;
+    break;
+  default:
+    llvm_unreachable("Unexpected size for LXADD 0");
+  }
+
+  // Note that FenceNode is used for the 'chain' operand, guaranteeing that it
+  // will be scheduled before the load.
+  SDValue FenceChain = SDValue(FenceNode, 0);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, FenceChain};
+  auto *LoadNode = CurDAG->getMachineNode(Opc, dl, NVT, MVT::Other, Ops);
+
+  // We must copy the information about the memory operand, but change the
+  // flags to remove MOStore: the replacement is a pure load and must not be
+  // marked as mayStore.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  auto *SMemOp = cast<MemSDNode>(Node)->getMemOperand();
+  auto Flags = SMemOp->getFlags() & ~MachineMemOperand::MOStore;
+  MemOp[0] = new MachineMemOperand(SMemOp->getPointerInfo(), Flags,
+                                   SMemOp->getSize(), SMemOp->getAlignment(),
+                                   SMemOp->getAAInfo(), SMemOp->getRanges());
+  cast<MachineSDNode>(LoadNode)->setMemRefs(MemOp, MemOp + 1);
+
+  return LoadNode;
+}
+
 SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
   if (Node->hasAnyUseOfValue(0))
     return nullptr;
@@ -2106,13 +2182,14 @@

   case X86ISD::GlobalBaseReg:
     return getGlobalBaseReg();
-
+  case ISD::ATOMIC_LOAD_ADD:
+    if (SDNode *RetVal = SelectAtomicAddZero(Node, NVT))
+      return RetVal;
+    /* FALLTHROUGH */
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND:
-  case ISD::ATOMIC_LOAD_OR:
-  case ISD::ATOMIC_LOAD_ADD: {
-    SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
-    if (RetVal)
+  case ISD::ATOMIC_LOAD_OR: {
+    if (SDNode *RetVal = SelectAtomicLoadArith(Node, NVT))
       return RetVal;
     break;
   }
Index: test/CodeGen/X86/atomic_add_zero.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/atomic_add_zero.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+  %1 = atomicrmw add i8* %p, i8 0 monotonic
+  ret i8 %1
+}
+
+define i16 @add16(i16* %p) {
+; CHECK-LABEL: add16
+; CHECK: mfence
+; CHECK: movw
+  %1 = atomicrmw add i16* %p, i16 0 monotonic
+  ret i16 %1
+}
+
+define i32 @add32(i32* %p) {
+; CHECK-LABEL: add32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw add i32* %p, i32 0 monotonic
+  ret i32 %1
+}
+
+define i64 @add64(i64* %p) {
+; CHECK-LABEL: add64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+  %1 = atomicrmw add i64* %p, i64 0 monotonic
+  ret i64 %1
+}
+
+define i128 @add128(i128* %p) {
+; CHECK-LABEL: add128
+; CHECK-NOT: mfence
+  %1 = atomicrmw add i128* %p, i128 0 monotonic
+  ret i128 %1
+}
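
Additional context for reviewers (not part of the patch): the seqlock reader pattern from the HP report cited in the comments boils down to the sketch below. The struct and function names are invented for illustration, and the payload is modelled with relaxed atomics as the report suggests; the final fetch_add(0, release) on the sequence counter is the kind of add-of-zero atomicrmw that this change selects as "mfence; mov" instead of "lock xadd", so readers never write the counter's cache line.

#include <atomic>
#include <cstdint>
#include <utility>

// Illustrative seqlock-protected pair of counters (names invented for this
// sketch).
struct SeqLocked {
  std::atomic<uint64_t> Seq{0};       // even = stable, odd = writer active
  std::atomic<uint64_t> A{0}, B{0};   // payload, accessed with relaxed ops
};

// Reader: retry until an even, unchanged sequence number is observed.
// The final re-read of Seq uses fetch_add(0, release) so that the payload
// loads cannot be reordered after it; with this patch that add-of-zero is
// selected as "mfence; mov" rather than a locked xadd.
std::pair<uint64_t, uint64_t> readSeqLocked(SeqLocked &S) {
  for (;;) {
    uint64_t Seq1 = S.Seq.load(std::memory_order_acquire);
    if (Seq1 & 1)
      continue;                                  // writer in progress, retry
    uint64_t A = S.A.load(std::memory_order_relaxed);
    uint64_t B = S.B.load(std::memory_order_relaxed);
    uint64_t Seq2 = S.Seq.fetch_add(0, std::memory_order_release);
    if (Seq1 == Seq2)
      return {A, B};                             // consistent snapshot
  }
}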