Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -1008,6 +1008,20 @@
     return false;
   }
 
+  /// On some platforms, an AtomicRMW that never actually modifies the value
+  /// (such as fetch_add of 0) can be turned into a fence followed by an
+  /// atomic load. This may sound useless, but it makes it possible for the
+  /// processor to keep the cacheline shared, dramatically improving
+  /// performance. Such idempotent RMWs are useful for implementing some
+  /// kinds of locks, see for example (justification + benchmarks):
+  /// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+  /// This method attempts that transformation: on success it returns the new
+  /// atomic load (the X86 override also replaces and erases RMWI itself), and
+  /// otherwise nullptr. This default implementation always returns nullptr.
+  /// AtomicExpand re-expands the load if shouldExpandAtomicLoadInIR accepts it.
+  virtual LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
+    return nullptr;
+  }
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
Index: llvm/trunk/lib/CodeGen/AtomicExpandPass.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/AtomicExpandPass.cpp
+++ llvm/trunk/lib/CodeGen/AtomicExpandPass.cpp
@@ -51,6 +51,8 @@
     bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
     bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI);
     bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
+    bool isIdempotentRMW(AtomicRMWInst *AI);
+    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
   };
 }
 
@@ -127,8 +129,15 @@
         MadeChange |= expandAtomicLoad(LI);
       } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) {
         MadeChange |= expandAtomicStore(SI);
-      } else if (RMWI && TargetLowering->shouldExpandAtomicRMWInIR(RMWI)) {
-        MadeChange |= expandAtomicRMW(RMWI);
+      } else if (RMWI) {
+        // There are two different ways of expanding RMW instructions:
+        // - into a load if it is idempotent
+        // - into a Cmpxchg/LL-SC loop otherwise
+        // we try them in that order.
+        MadeChange |= (isIdempotentRMW(RMWI) &&
+                        simplifyIdempotentRMW(RMWI)) ||
+                      (TargetLowering->shouldExpandAtomicRMWInIR(RMWI) &&
+                        expandAtomicRMW(RMWI));
       } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) {
         MadeChange |= expandAtomicCmpXchg(CASI);
       }
@@ -520,3 +529,41 @@
   CI->eraseFromParent();
   return true;
 }
+
+/// Returns true if \p RMWI can never change the value in memory, i.e. its
+/// operand is a constant integer that is the identity of the operation
+/// (0 for add/sub/or/xor, all-ones for and).
+bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
+  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
+  if (!C)
+    return false;
+
+  AtomicRMWInst::BinOp Op = RMWI->getOperation();
+  switch (Op) {
+  case AtomicRMWInst::Add:
+  case AtomicRMWInst::Sub:
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::Xor:
+    return C->isZero();
+  case AtomicRMWInst::And:
+    return C->isMinusOne();
+  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
+  default:
+    return false;
+  }
+}
+
+/// Asks the target to turn the idempotent \p RMWI into a fenced atomic load,
+/// then expands that load further if the target requests it. Returns true
+/// iff the target performed the transformation.
+bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+
+  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
+    if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad))
+      expandAtomicLoad(ResultingLoad);
+    return true;
+  }
+
+  return false;
+}
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -965,6 +965,9 @@
     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
     bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 
+    LoadInst *
+    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
     bool needsCmpXchgNb(const Type *MemType) const;
 
     /// Utility function to emit atomic-load-arith operations (and, or, xor,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -17802,6 +17802,74 @@
   }
 }
 
+static bool hasMFENCE(const X86Subtarget &Subtarget) {
+  // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+  // no-sse2). There isn't any reason to disable it if the target processor
+  // supports it.
+  return Subtarget.hasSSE2() || Subtarget.is64Bit();
+}
+
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+  const X86Subtarget &Subtarget =
+      getTargetMachine().getSubtarget<X86Subtarget>();
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  const Type *MemType = AI->getType();
+  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+  // there is no benefit in turning such RMWs into loads, and it is actually
+  // harmful as it introduces a mfence.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+    return nullptr;
+
+  auto Builder = IRBuilder<>(AI);
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  auto SynchScope = AI->getSynchScope();
+  // We must restrict the ordering to avoid generating loads with Release or
+  // AcquireRelease orderings.
+  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+  auto Ptr = AI->getPointerOperand();
+
+  // Before the load we need a fence. Here is an example lifted from
+  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+  // is required:
+  // Thread 0:
+  //   x.store(1, relaxed);
+  //   r1 = y.fetch_add(0, release);
+  // Thread 1:
+  //   y.fetch_add(42, acquire);
+  //   r2 = x.load(relaxed);
+  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+  // lowered to just a load without a fence. A mfence flushes the store buffer,
+  // making the optimization clearly correct.
+  // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
+  // otherwise, we might be able to be more aggressive on relaxed idempotent
+  // rmw. In practice, they do not look useful, so we don't try to be
+  // especially clever.
+  if (SynchScope == SingleThread) {
+    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+    // the IR level, so we must wrap it in an intrinsic.
+    return nullptr;
+  } else if (hasMFENCE(Subtarget)) {
+    Function *MFence = llvm::Intrinsic::getDeclaration(M,
+            Intrinsic::x86_sse2_mfence);
+    Builder.CreateCall(MFence);
+  } else {
+    // FIXME: it might make sense to use a locked operation here but on a
+    // different cache-line to prevent cache-line bouncing. In practice it
+    // is probably a small win, and x86 processors without mfence are rare
+    // enough that we do not bother.
+    return nullptr;
+  }
+
+  // Finally we can emit the atomic load (alignment is given in bytes).
+  LoadInst *Loaded =
+      Builder.CreateAlignedLoad(Ptr, MemType->getPrimitiveSizeInBits() / 8);
+  Loaded->setAtomic(Order, SynchScope);
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+  return Loaded;
+}
+
 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
   SDLoc dl(Op);
@@ -17813,10 +17881,7 @@
   // The only fence that needs an instruction is a sequentially-consistent
   // cross-thread fence.
   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
-    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
-    // no-sse2). There isn't any reason to disable it if the target processor
-    // supports it.
-    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+    if (hasMFENCE(*Subtarget))
       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
     SDValue Chain = Op.getOperand(0);
Index: llvm/trunk/test/CodeGen/X86/atomic_idempotent.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/atomic_idempotent.ll
+++ llvm/trunk/test/CodeGen/X86/atomic_idempotent.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+  %1 = atomicrmw add i8* %p, i8 0 monotonic
+  ret i8 %1
+}
+
+define i16 @or16(i16* %p) {
+; CHECK-LABEL: or16
+; CHECK: mfence
+; CHECK: movw
+  %1 = atomicrmw or i16* %p, i16 0 acquire
+  ret i16 %1
+}
+
+define i32 @xor32(i32* %p) {
+; CHECK-LABEL: xor32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw xor i32* %p, i32 0 release
+  ret i32 %1
+}
+
+define i64 @sub64(i64* %p) {
+; CHECK-LABEL: sub64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+  %1 = atomicrmw sub i64* %p, i64 0 seq_cst
+  ret i64 %1
+}
+
+define i128 @or128(i128* %p) {
+; CHECK-LABEL: or128
+; CHECK-NOT: mfence
+  %1 = atomicrmw or i128* %p, i128 0 monotonic
+  ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
+define i32 @and32 (i32* %p) {
+; CHECK-LABEL: and32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw and i32* %p, i32 -1 acq_rel
+  ret i32 %1
+}