Index: include/llvm/IR/Instruction.h =================================================================== --- include/llvm/IR/Instruction.h +++ include/llvm/IR/Instruction.h @@ -338,6 +338,11 @@ return mayReadFromMemory() || mayWriteToMemory(); } + /// isAtomic - Return true if this instruction has an + /// AtomicOrdering of unordered or higher. + /// + bool isAtomic() const; + /// mayThrow - Return true if this instruction may throw an exception. /// bool mayThrow() const; Index: include/llvm/IR/Instructions.h =================================================================== --- include/llvm/IR/Instructions.h +++ include/llvm/IR/Instructions.h @@ -241,7 +241,6 @@ (xthread << 6)); } - bool isAtomic() const { return getOrdering() != NotAtomic; } void setAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread) { setOrdering(Ordering); @@ -361,7 +360,6 @@ (xthread << 6)); } - bool isAtomic() const { return getOrdering() != NotAtomic; } void setAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread) { setOrdering(Ordering); Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -31,6 +31,7 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Target/TargetCallingConv.h" @@ -935,6 +936,10 @@ /// \name Helpers for atomic expansion. /// @{ + /// True if AtomicExpandPass should use emitLoadLinked/emitStoreConditional + /// and expand AtomicCmpXchgInst. + virtual bool hasLoadLinkedStoreConditional() const { return false; } + /// Perform a load-linked operation on Addr, returning a "Value *" with the /// corresponding pointee type. This may entail some non-trivial operations to /// truncate or reconstruct types that will be illegal in the backend. See @@ -955,7 +960,7 @@ /// It is called by AtomicExpandPass before expanding an /// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad. /// RMW and CmpXchg set both IsStore and IsLoad to true. - /// Backends with !getInsertFencesForAtomic() should keep a no-op here + /// Backends with !getInsertFencesForAtomic() should keep a no-op here. virtual void emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { assert(!getInsertFencesForAtomic()); @@ -965,20 +970,28 @@ /// It is called by AtomicExpandPass after expanding an /// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad. /// RMW and CmpXchg set both IsStore and IsLoad to true. - /// Backends with !getInsertFencesForAtomic() should keep a no-op here + /// Backends with !getInsertFencesForAtomic() should keep a no-op here. virtual void emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { assert(!getInsertFencesForAtomic()); } - /// Return true if the given (atomic) instruction should be expanded by the - /// IR-level AtomicExpand pass into a loop involving - /// load-linked/store-conditional pairs. Atomic stores will be expanded in the - /// same way as "atomic xchg" operations which ignore their output if needed. - virtual bool shouldExpandAtomicInIR(Instruction *Inst) const { + /// Returns true if the given (atomic) store should be expanded by the + /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input. 
+ virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const { return false; } + /// Returns true if the given (atomic) load should be expanded by the + /// IR-level AtomicExpand pass into a load-linked instruction + /// (through emitLoadLinked()). + virtual bool shouldExpandAtomicLoadInIR(LoadInst *LI) const { return false; } + + /// Returns true if the given AtomicRMW should be expanded by the + /// IR-level AtomicExpand pass into a loop using LoadLinked/StoreConditional. + virtual bool shouldExpandAtomicRMWInIR(AtomicRMWInst *RMWI) const { + return false; + } //===--------------------------------------------------------------------===// // TargetLowering Configuration Methods - These methods should be invoked by Index: lib/CodeGen/AtomicExpandPass.cpp =================================================================== --- lib/CodeGen/AtomicExpandPass.cpp +++ lib/CodeGen/AtomicExpandPass.cpp @@ -8,13 +8,14 @@ //===----------------------------------------------------------------------===// // // This file contains a pass (at IR level) to replace atomic instructions with -// appropriate (intrinsic-based) ldrex/strex loops. +// either (intrinsic-based) ldrex/strex loops or AtomicCmpXchg. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" @@ -38,15 +39,14 @@ } bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); + private: bool expandAtomicLoad(LoadInst *LI); - bool expandAtomicStore(StoreInst *LI); + bool expandAtomicStore(StoreInst *SI); bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicRMWToLLSC(AtomicRMWInst *AI); + bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); - - AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord); - void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord); }; } @@ -63,55 +63,64 @@ bool AtomicExpand::runOnFunction(Function &F) { if (!TM || !TM->getSubtargetImpl()->enableAtomicExpand()) return false; + auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering(); SmallVector AtomicInsts; // Changing control-flow while iterating through it is a bad idea, so gather a // list of all atomic instructions before we start. 
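Note on the new hooks: an atomic store that the target flags via shouldExpandAtomicStoreInIR is rewritten as an atomic exchange whose loaded result is simply discarded. A minimal user-level sketch of that equivalence with std::atomic (illustrative only, not part of the patch):

    #include <atomic>

    std::atomic<long long> Loc;

    // A "store atomic" of a value too wide for a plain store becomes, after
    // expansion, an exchange whose old value is ignored.
    void storeViaXchg(long long V) {
      (void)Loc.exchange(V, std::memory_order_seq_cst); // result discarded
    }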
- for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa<AtomicRMWInst>(&Inst) || isa<AtomicCmpXchgInst>(&Inst) || - (isa<LoadInst>(&Inst) && cast<LoadInst>(&Inst)->isAtomic()) || - (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { + if (I->isAtomic()) + AtomicInsts.push_back(&*I); + } bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!TM->getSubtargetImpl()->getTargetLowering()->shouldExpandAtomicInIR( - Inst)) - continue; + for (auto I : AtomicInsts) { + auto LI = dyn_cast<LoadInst>(I); + auto SI = dyn_cast<StoreInst>(I); + auto RMWI = dyn_cast<AtomicRMWInst>(I); + auto CASI = dyn_cast<AtomicCmpXchgInst>(I); + + assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) && + "Unknown atomic instruction"); - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) - MadeChange |= expandAtomicRMW(AI); - else if (AtomicCmpXchgInst *CI = dyn_cast<AtomicCmpXchgInst>(Inst)) - MadeChange |= expandAtomicCmpXchg(CI); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + if (LI && TargetLowering->shouldExpandAtomicLoadInIR(LI)) { MadeChange |= expandAtomicLoad(LI); - else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) { MadeChange |= expandAtomicStore(SI); - else - llvm_unreachable("Unknown atomic instruction"); + } else if (RMWI && TargetLowering->shouldExpandAtomicRMWInIR(RMWI)) { + MadeChange |= expandAtomicRMW(RMWI); + } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) { + MadeChange |= expandAtomicCmpXchg(CASI); + } } - return MadeChange; } bool AtomicExpand::expandAtomicLoad(LoadInst *LI) { - // Load instructions don't actually need a leading fence, even in the - // SequentiallyConsistent case. + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + // If getInsertFencesForAtomic() returns true, then the target does not want + // to deal with memory orders, and emitLeading/TrailingFence should take care + // of everything. Otherwise, emitLeading/TrailingFence are no-op and we + // should preserve the ordering. AtomicOrdering MemOpOrder = - TM->getSubtargetImpl()->getTargetLowering()->getInsertFencesForAtomic() - ? Monotonic - : LI->getOrdering(); + TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering(); + IRBuilder<> Builder(LI); - // The only 64-bit load guaranteed to be single-copy atomic by the ARM is + // Note that although no fence is required before atomic load on ARM, it is + // required before SequentiallyConsistent loads for the recommended Power + // mapping (see http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html). + // So we let the target choose what to emit. + TLI->emitLeadingFence(Builder, LI->getOrdering(), + /*IsStore=*/false, /*IsLoad=*/true); + + // The only 64-bit load guaranteed to be single-copy atomic by ARM is // an ldrexd (A3.5.3). - IRBuilder<> Builder(LI); - Value *Val = TM->getSubtargetImpl()->getTargetLowering()->emitLoadLinked( - Builder, LI->getPointerOperand(), MemOpOrder); + Value *Val = + TLI->emitLoadLinked(Builder, LI->getPointerOperand(), MemOpOrder); - insertTrailingFence(Builder, LI->getOrdering()); + TLI->emitTrailingFence(Builder, LI->getOrdering(), + /*IsStore=*/false, /*IsLoad=*/true); LI->replaceAllUsesWith(Val); LI->eraseFromParent(); @@ -120,9 +129,12 @@ } bool AtomicExpand::expandAtomicStore(StoreInst *SI) { - // The only atomic 64-bit store on ARM is an strexd that succeeds, which means - // we need a loop and the entire instruction is essentially an "atomicrmw - // xchg" that ignores the value loaded.
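The getInsertFencesForAtomic() choice made in expandAtomicLoad above has a direct C++ analogue: ordering is either carried by the access itself, or approximated by a relaxed ("monotonic") access bracketed with explicit fences, which is what emitLeading/TrailingFence produce for such targets. A sketch of the two forms for a load (illustrative only):

    #include <atomic>

    std::atomic<int> Flag;

    int loadAcquireDirect() {
      return Flag.load(std::memory_order_acquire);          // ordering on the access itself
    }

    int loadAcquireWithFence() {
      int V = Flag.load(std::memory_order_relaxed);          // "monotonic" access...
      std::atomic_thread_fence(std::memory_order_acquire);   // ...plus a trailing fence
      return V;
    }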
+ // This function is only called on atomic stores that are too large to be + // atomic if implemented as a native store. So we replace them by an + // atomic swap, that can be implemented for example as a ldrex/strex on ARM + // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes. + // It is the responsibility of the target to only return true in + // shouldExpandAtomicRMW in cases where this is required and possible. IRBuilder<> Builder(SI); AtomicRMWInst *AI = Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), @@ -134,11 +146,64 @@ } bool AtomicExpand::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = AI->getOrdering(); + if (TM->getSubtargetImpl() + ->getTargetLowering() + ->hasLoadLinkedStoreConditional()) + return expandAtomicRMWToLLSC(AI); + else + return expandAtomicRMWToCmpXchg(AI); +} + +/// Emit IR to implement the given atomicrmw operation on values in registers, +/// returning the new value. +static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, + Value *Loaded, Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + default: + llvm_unreachable("Unknown atomic op"); + } +} + +bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + AtomicOrdering FenceOrder = AI->getOrdering(); Value *Addr = AI->getPointerOperand(); BasicBlock *BB = AI->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); + // If getInsertFencesForAtomic() returns true, then the target does not want + // to deal with memory orders, and emitLeading/TrailingFence should take care + // of everything. Otherwise, emitLeading/TrailingFence are no-op and we + // should preserve the ordering. + AtomicOrdering MemOpOrder = + TLI->getInsertFencesForAtomic() ? Monotonic : FenceOrder; // Given: atomicrmw some_op iN* %addr, iN %incr ordering // @@ -165,67 +230,24 @@ // the branch entirely. std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order); + TLI->emitLeadingFence(Builder, FenceOrder, /*IsStore=*/true, /*IsLoad=*/true); Builder.CreateBr(LoopBB); // Start the main loop block now that we've taken care of the preliminaries. 
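For reference, the values computed by the new performAtomicOp helper match this plain scalar function; the RMWOp enum and applyRMW name are hypothetical stand-ins, only the arithmetic is meant to mirror the IR built above:

    #include <cassert>
    #include <cstdint>

    enum class RMWOp { Xchg, Add, Sub, And, Nand, Or, Xor, Max, Min, UMax, UMin };

    int64_t applyRMW(RMWOp Op, int64_t Loaded, int64_t Inc) {
      switch (Op) {
      case RMWOp::Xchg: return Inc;
      case RMWOp::Add:  return Loaded + Inc;
      case RMWOp::Sub:  return Loaded - Inc;
      case RMWOp::And:  return Loaded & Inc;
      case RMWOp::Nand: return ~(Loaded & Inc);
      case RMWOp::Or:   return Loaded | Inc;
      case RMWOp::Xor:  return Loaded ^ Inc;
      case RMWOp::Max:  return Loaded > Inc ? Loaded : Inc;                      // signed
      case RMWOp::Min:  return Loaded <= Inc ? Loaded : Inc;                     // signed
      case RMWOp::UMax: return (uint64_t)Loaded > (uint64_t)Inc ? Loaded : Inc;
      case RMWOp::UMin: return (uint64_t)Loaded <= (uint64_t)Inc ? Loaded : Inc;
      }
      assert(false && "unknown atomic op");
      return 0;
    }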
Builder.SetInsertPoint(LoopBB); - Value *Loaded = TM->getSubtargetImpl()->getTargetLowering()->emitLoadLinked( - Builder, Addr, MemOpOrder); + Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *NewVal; - switch (AI->getOperation()) { - case AtomicRMWInst::Xchg: - NewVal = AI->getValOperand(); - break; - case AtomicRMWInst::Add: - NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Sub: - NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::And: - NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Nand: - NewVal = Builder.CreateNot(Builder.CreateAnd(Loaded, AI->getValOperand()), - "new"); - break; - case AtomicRMWInst::Or: - NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Xor: - NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - default: - llvm_unreachable("Unknown atomic op"); - } + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); Value *StoreSuccess = - TM->getSubtargetImpl()->getTargetLowering()->emitStoreConditional( - Builder, NewVal, Addr, MemOpOrder); + TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - insertTrailingFence(Builder, Order); + TLI->emitTrailingFence(Builder, FenceOrder, /*IsStore=*/true, /*IsLoad=*/true); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -233,13 +255,90 @@ return true; } +bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) { + auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering(); + AtomicOrdering FenceOrder = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + AtomicOrdering MemOpOrder = + TargetLowering->getInsertFencesForAtomic() ? Monotonic : FenceOrder; + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] 
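The expansion sketched in the comment above has the same shape as a hand-written compare-exchange loop. A C++ sketch for the nand case (compare_exchange_weak is fine here because we retry anyway; the pass itself emits a strong cmpxchg):

    #include <atomic>
    #include <cstdint>

    int64_t fetchNand(std::atomic<int64_t> &A, int64_t Inc) {
      int64_t Loaded = A.load(std::memory_order_relaxed);       // %init_loaded
      int64_t New;
      do {
        New = ~(Loaded & Inc);                                   // %new = some_op %loaded, %incr
      } while (!A.compare_exchange_weak(Loaded, New,             // %pair = cmpxchg ...; on failure,
                                        std::memory_order_seq_cst,
                                        std::memory_order_relaxed)); // Loaded is refreshed (%new_loaded)
      return Loaded;                                             // value observed before the update
    }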
+ BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + TargetLowering->emitLeadingFence(Builder, FenceOrder, + /*IsStore=*/true, /*IsLoad=*/true); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, MemOpOrder, + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + Loaded->addIncoming(NewLoaded, LoopBB); + + Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + TargetLowering->emitTrailingFence(Builder, FenceOrder, + /*IsStore=*/true, /*IsLoad=*/true); + + AI->replaceAllUsesWith(NewLoaded); + AI->eraseFromParent(); + + return true; +} + bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); AtomicOrdering FailureOrder = CI->getFailureOrdering(); Value *Addr = CI->getPointerOperand(); BasicBlock *BB = CI->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); + // If getInsertFencesForAtomic() returns true, then the target does not want + // to deal with memory orders, and emitLeading/TrailingFence should take care + // of everything. Otherwise, emitLeading/TrailingFence are no-op and we + // should preserve the ordering. + AtomicOrdering MemOpOrder = + TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder; // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord // @@ -280,13 +379,13 @@ // the branch entirely. std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder); + TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, + /*IsLoad=*/true); Builder.CreateBr(LoopBB); // Start the main loop block now that we've taken care of the preliminaries. 
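getStrongestFailureOrdering() above picks the failure ordering for the emitted cmpxchg; the rule mirrors C++, where compare_exchange takes separate success and failure orderings and the failure ordering may not carry release semantics. A small sketch (acq_rel success paired with acquire failure is an assumed use case):

    #include <atomic>

    bool tryPublish(std::atomic<int> &Slot, int Expected, int Desired) {
      // No store happens on failure, so no release component is needed there.
      return Slot.compare_exchange_strong(Expected, Desired,
                                          std::memory_order_acq_rel,
                                          std::memory_order_acquire);
    }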
Builder.SetInsertPoint(LoopBB); - Value *Loaded = TM->getSubtargetImpl()->getTargetLowering()->emitLoadLinked( - Builder, Addr, MemOpOrder); + Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); Value *ShouldStore = Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store"); @@ -295,9 +394,8 @@ Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); Builder.SetInsertPoint(TryStoreBB); - Value *StoreSuccess = - TM->getSubtargetImpl()->getTargetLowering()->emitStoreConditional( - Builder, CI->getNewValOperand(), Addr, MemOpOrder); + Value *StoreSuccess = TLI->emitStoreConditional( + Builder, CI->getNewValOperand(), Addr, MemOpOrder); StoreSuccess = Builder.CreateICmpEQ( StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); Builder.CreateCondBr(StoreSuccess, SuccessBB, @@ -305,11 +403,13 @@ // Make sure later instructions don't get reordered with a fence if necessary. Builder.SetInsertPoint(SuccessBB); - insertTrailingFence(Builder, SuccessOrder); + TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true, + /*IsLoad=*/true); Builder.CreateBr(ExitBB); Builder.SetInsertPoint(FailureBB); - insertTrailingFence(Builder, FailureOrder); + TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, + /*IsLoad=*/true); Builder.CreateBr(ExitBB); // Finally, we have control-flow based knowledge of whether the cmpxchg @@ -358,27 +458,3 @@ CI->eraseFromParent(); return true; } - -AtomicOrdering AtomicExpand::insertLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TM->getSubtargetImpl()->getTargetLowering()->getInsertFencesForAtomic()) - return Ord; - - if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) - Builder.CreateFence(Release); - - // The exclusive operations don't need any barrier if we're adding separate - // fences. 
- return Monotonic; -} - -void AtomicExpand::insertTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TM->getSubtargetImpl()->getTargetLowering()->getInsertFencesForAtomic()) - return; - - if (Ord == Acquire || Ord == AcquireRelease) - Builder.CreateFence(Acquire); - else if (Ord == SequentiallyConsistent) - Builder.CreateFence(SequentiallyConsistent); -} Index: lib/IR/Instruction.cpp =================================================================== --- lib/IR/Instruction.cpp +++ lib/IR/Instruction.cpp @@ -443,6 +443,21 @@ } } +bool Instruction::isAtomic() const { + switch (getOpcode()) { + default: + return false; + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + case Instruction::Fence: + return true; + case Instruction::Load: + return cast(this)->getOrdering() != NotAtomic; + case Instruction::Store: + return cast(this)->getOrdering() != NotAtomic; + } +} + bool Instruction::mayThrow() const { if (const CallInst *CI = dyn_cast(this)) return !CI->doesNotThrow(); Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -317,12 +317,15 @@ bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicInIR(Instruction *Inst) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8513,19 +8513,6 @@ } } -bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 128-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; - else if (LoadInst *LI = dyn_cast(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 128; - - // For the real atomic operations, we have ldxr/stxr up to 128 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 128; -} - bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } @@ -8542,6 +8529,32 @@ return TargetLoweringBase::getPreferredVectorAction(VT); } +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. +bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return Size == 128; +} + +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. 
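A minimal illustration of the new Instruction::isAtomic() predicate together with inst_iterator, the same idiom runOnFunction now uses; note that fences also report isAtomic(), so clients that only care about memory accesses must filter them out. Sketch against the 3.5-era headers; countAtomicAccesses is a hypothetical helper:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstIterator.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Count atomic loads/stores/RMWs/cmpxchgs in F, skipping fences.
    static unsigned countAtomicAccesses(Function &F) {
      unsigned N = 0;
      for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
        if (I->isAtomic() && !isa<FenceInst>(&*I))
          ++N;
      return N;
    }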
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + return Size == 128; +} + +// For the real atomic operations, we have ldxr/stxr up to 128 bits, +bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + return Size <= 128; +} + +bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { + return true; +} + Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -392,12 +392,20 @@ bool functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicInIR(Instruction *Inst) const override; + void emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + void emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + + bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool useLoadStackGuardNode() const override; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -2723,7 +2723,7 @@ ConstantSDNode *OrdN = cast(Op.getOperand(1)); AtomicOrdering Ord = static_cast(OrdN->getZExtValue()); - unsigned Domain = ARM_MB::ISH; + ARM_MB::MemBOpt Domain = ARM_MB::ISH; if (Subtarget->isMClass()) { // Only a full system barrier exists in the M-class architectures. Domain = ARM_MB::SY; @@ -10982,23 +10982,88 @@ return true; } -bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 64-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit - // anything for those. 
- bool IsMClass = Subtarget->isMClass(); - if (StoreInst *SI = dyn_cast(Inst)) { - unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - return Size == 64 && !IsMClass; - } else if (LoadInst *LI = dyn_cast(Inst)) { - return LI->getType()->getPrimitiveSizeInBits() == 64 && !IsMClass; +bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; } + +static void makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); + Constant *CDomain = Builder.getInt32(Domain); + Builder.CreateCall(DMB, CDomain); +} + +// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html +void ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + AtomicOrdering Ord, bool IsStore, + bool IsLoad) const { + if (!getInsertFencesForAtomic()) + return; + + switch (Ord) { + case NotAtomic: + case Unordered: + llvm_unreachable("Invalid fence: unordered/non-atomic"); + case Monotonic: + case Acquire: + return; // Nothing to do + case SequentiallyConsistent: + if (!IsStore) + return; // Nothing to do + /*FALLTHROUGH*/ + case Release: + case AcquireRelease: + if (Subtarget->isSwift()) + makeDMB(Builder, ARM_MB::ISHST); + // FIXME: add a comment with a link to documentation justifying this. + else + makeDMB(Builder, ARM_MB::ISH); + return; } +} + +void ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + AtomicOrdering Ord, bool IsStore, + bool IsLoad) const { + if (!getInsertFencesForAtomic()) + return; + + switch (Ord) { + case NotAtomic: + case Unordered: + llvm_unreachable("Invalid fence: unordered/not-atomic"); + case Monotonic: + case Release: + return; // Nothing to do + case Acquire: + case AcquireRelease: + case SequentiallyConsistent: + makeDMB(Builder, ARM_MB::ISH); + return; + } +} + +// Loads and stores less than 64-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit +// anything for those. +bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return (Size == 64) && !Subtarget->isMClass(); +} + +// Loads and stores less than 64-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit +// anything for those. +bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + return (Size == 64) && !Subtarget->isMClass(); +} - // For the real atomic operations, we have ldrex/strex up to 32 bits, - // and up to 64 bits on the non-M profiles - unsigned AtomicLimit = IsMClass ? 32 : 64; - return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit; +// For the real atomic operations, we have ldrex/strex up to 32 bits, +// and up to 64 bits on the non-M profiles +bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + return Size <= (Subtarget->isMClass() ? 32 : 64); } // This has so far only been implemented for MachO. 
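The dmb placement chosen by emitLeadingFence/emitTrailingFence above matches the ARMv7 mapping from the page cited in the code. A small std::atomic example; the instruction sequences in the comments are the expected ARMv7 lowering and are informational only:

    #include <atomic>

    std::atomic<int> G;

    int  loadAcquire()        { return G.load(std::memory_order_acquire); }          // ldr; dmb ish
    void storeRelease(int V)  { G.store(V, std::memory_order_release); }             // dmb ish; str
    void storeSeqCst(int V)   { G.store(V, std::memory_order_seq_cst); }             // dmb ish; str; dmb ish
    int  addSeqCst(int V)     { return G.fetch_add(V, std::memory_order_seq_cst); }  // dmb ish; ldrex/strex loop; dmb ish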
Index: lib/Target/X86/CMakeLists.txt =================================================================== --- lib/Target/X86/CMakeLists.txt +++ lib/Target/X86/CMakeLists.txt @@ -14,7 +14,6 @@ set(sources X86AsmPrinter.cpp - X86AtomicExpandPass.cpp X86FastISel.cpp X86FloatingPoint.cpp X86FrameLowering.cpp Index: lib/Target/X86/X86.h =================================================================== --- lib/Target/X86/X86.h +++ lib/Target/X86/X86.h @@ -23,10 +23,6 @@ class ImmutablePass; class X86TargetMachine; -/// createX86AtomicExpandPass - This pass expands atomic operations that cannot -/// be handled natively in terms of a loop using cmpxchg. -FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM); - /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// Index: lib/Target/X86/X86AtomicExpandPass.cpp =================================================================== --- lib/Target/X86/X86AtomicExpandPass.cpp +++ /dev/null @@ -1,283 +0,0 @@ -//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass (at IR level) to replace atomic instructions which -// cannot be implemented as a single instruction with cmpxchg-based loops. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86TargetMachine.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "x86-atomic-expand" - -namespace { - class X86AtomicExpandPass : public FunctionPass { - const X86TargetMachine *TM; - public: - static char ID; // Pass identification, replacement for typeid - explicit X86AtomicExpandPass(const X86TargetMachine *TM) - : FunctionPass(ID), TM(TM) {} - - bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); - - bool needsCmpXchgNb(Type *MemType); - - /// There are four kinds of atomic operations. Two never need expanding: - /// cmpxchg is what we expand the others *to*, and loads are easily handled - /// by ISelLowering. Atomicrmw and store can need expanding in some - /// circumstances. - bool shouldExpand(Instruction *Inst); - - /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms - /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic. - bool shouldExpandStore(StoreInst *SI); - - /// Only some atomicrmw instructions need expanding -- some operations - /// (e.g. max) have absolutely no architectural support; some (e.g. or) have - /// limited support but can't return the previous value; some (e.g. add) - /// have complete support in the instruction set. - /// - /// Also, naturally, 128-bit operations always need to be expanded. 
- bool shouldExpandAtomicRMW(AtomicRMWInst *AI); - - bool expandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicStore(StoreInst *SI); - }; -} - -char X86AtomicExpandPass::ID = 0; - -FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) { - return new X86AtomicExpandPass(TM); -} - -bool X86AtomicExpandPass::runOnFunction(Function &F) { - SmallVector AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa(&Inst) || - (isa(&Inst) && cast(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } - - bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!shouldExpand(Inst)) - continue; - - if (AtomicRMWInst *AI = dyn_cast(Inst)) - MadeChange |= expandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast(Inst)) - MadeChange |= expandAtomicStore(SI); - - assert(MadeChange && "Atomic inst not expanded when it should be?"); - Inst->eraseFromParent(); - } - - return MadeChange; -} - -/// Returns true if the operand type is 1 step up from the native width, and -/// the corresponding cmpxchg8b or cmpxchg16b instruction is available -/// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) { - const X86Subtarget &Subtarget = TM->getSubtarget(); - unsigned OpWidth = MemType->getPrimitiveSizeInBits(); - - if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b - if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); - - return false; -} - -bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) { - const X86Subtarget &Subtarget = TM->getSubtarget(); - unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; - - if (needsCmpXchgNb(AI->getType())) - return true; - - if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth) - return false; - - AtomicRMWInst::BinOp Op = AI->getOperation(); - switch (Op) { - default: - llvm_unreachable("Unknown atomic operation"); - case AtomicRMWInst::Xchg: - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - // It's better to use xadd, xsub or xchg for these in all cases. - return false; - case AtomicRMWInst::Or: - case AtomicRMWInst::And: - case AtomicRMWInst::Xor: - // If the atomicrmw's result isn't actually used, we can just add a "lock" - // prefix to a normal instruction for these operations. - return !AI->use_empty(); - case AtomicRMWInst::Nand: - case AtomicRMWInst::Max: - case AtomicRMWInst::Min: - case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: - // These always require a non-trivial set of data operations on x86. We must - // use a cmpxchg loop. - return true; - } -} - -bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) { - if (needsCmpXchgNb(SI->getValueOperand()->getType())) - return true; - - return false; -} - -bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) { - if (AtomicRMWInst *AI = dyn_cast(Inst)) - return shouldExpandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast(Inst)) - return shouldExpandStore(SI); - return false; -} - -/// Emit IR to implement the given atomicrmw operation on values in registers, -/// returning the new value. 
-static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, - Value *Loaded, Value *Inc) { - Value *NewVal; - switch (Op) { - case AtomicRMWInst::Xchg: - return Inc; - case AtomicRMWInst::Add: - return Builder.CreateAdd(Loaded, Inc, "new"); - case AtomicRMWInst::Sub: - return Builder.CreateSub(Loaded, Inc, "new"); - case AtomicRMWInst::And: - return Builder.CreateAnd(Loaded, Inc, "new"); - case AtomicRMWInst::Nand: - return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); - case AtomicRMWInst::Or: - return Builder.CreateOr(Loaded, Inc, "new"); - case AtomicRMWInst::Xor: - return Builder.CreateXor(Loaded, Inc, "new"); - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - default: - break; - } - llvm_unreachable("Unknown atomic op"); -} - -bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = - AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] - // %init_loaded = load atomic iN* %addr - // br label %loop - // loop: - // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] - // %new = some_op iN %loaded, %incr - // %pair = cmpxchg iN* %addr, iN %loaded, iN %new - // %new_loaded = extractvalue { iN, i1 } %pair, 0 - // %success = extractvalue { iN, i1 } %pair, 1 - // br i1 %success, label %atomicrmw.end, label %loop - // atomicrmw.end: - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we want a load. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - LoadInst *InitLoaded = Builder.CreateLoad(Addr); - InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. 
- Builder.SetInsertPoint(LoopBB); - PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); - Loaded->addIncoming(InitLoaded, BB); - - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); - - Value *Pair = Builder.CreateAtomicCmpXchg( - Addr, Loaded, NewVal, Order, - AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); - Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); - Loaded->addIncoming(NewLoaded, LoopBB); - - Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); - Builder.CreateCondBr(Success, ExitBB, LoopBB); - - AI->replaceAllUsesWith(NewLoaded); - - return true; -} - -bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) { - // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express - // this in terms of the usual expansion to "atomicrmw xchg". - IRBuilder<> Builder(SI); - AtomicOrdering Order = - SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering(); - AtomicRMWInst *AI = - Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), - SI->getValueOperand(), Order); - - // Now we have an appropriate swap instruction, lower it as usual. - if (shouldExpandAtomicRMW(AI)) { - expandAtomicRMW(AI); - AI->eraseFromParent(); - return true; - } - - return AI; -} Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -960,6 +960,12 @@ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + bool needsCmpXchgNb(const Type *MemType) const; + /// Utility function to emit atomic-load-arith operations (and, or, xor, /// nand, max, min, umax, umin). It takes the corresponding instruction to /// expand, the associated machine basic block, and the associated X86 Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -16612,6 +16612,68 @@ } } +/// Returns true if the operand type is 1 step up from the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. +/// Used to know whether to use cmpxchg8/16b when expanding atomic operations +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget(); + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + else if (OpWidth == 128) + return Subtarget.hasCmpxchg16b(); + else + return false; +} + +bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return needsCmpXchgNb(SI->getValueOperand()->getType()); +} + +bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *SI) const { + return false; // FIXME, currently these are expanded separately in this file. +} + +bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget(); + unsigned NativeWidth = Subtarget.is64Bit() ? 
64 : 32; + const Type *MemType = AI->getType(); + + // If the operand is too big, we must see if cmpxchg8/16b is available + // and default to library calls otherwise. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return needsCmpXchgNb(MemType); + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -17131,7 +17193,7 @@ case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: // Delegate to generic TypeLegalization. Situations we can really handle - // should have already been dealt with by X86AtomicExpandPass.cpp. + // should have already been dealt with by AtomicExpandPass.cpp. break; case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -105,7 +105,7 @@ } void X86PassConfig::addIRPasses() { - addPass(createX86AtomicExpandPass(&getX86TargetMachine())); + addPass(createAtomicExpandPass(&getX86TargetMachine())); TargetPassConfig::addIRPasses(); } Index: test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll =================================================================== --- test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll +++ test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll @@ -2,7 +2,7 @@ define i8 @test_atomic_xchg_i8(i8* %ptr, i8 %xchgend) { ; CHECK-LABEL: @test_atomic_xchg_i8 -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -12,7 +12,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw xchg i8* %ptr, i8 %xchgend monotonic ret i8 %res @@ -20,7 +20,7 @@ define i16 @test_atomic_add_i16(i16* %ptr, i16 %addend) { ; CHECK-LABEL: @test_atomic_add_i16 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr) @@ -31,7 +31,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i16 [[OLDVAL]] %res = atomicrmw add i16* %ptr, i16 %addend seq_cst ret i16 %res @@ -39,7 +39,7 @@ define i32 @test_atomic_sub_i32(i32* %ptr, i32 %subend) { ; CHECK-LABEL: @test_atomic_sub_i32 -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL:%.*]] = call 
i32 @llvm.arm.ldrex.p0i32(i32* %ptr) @@ -48,7 +48,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence acquire +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i32 [[OLDVAL]] %res = atomicrmw sub i32* %ptr, i32 %subend acquire ret i32 %res @@ -56,7 +56,7 @@ define i8 @test_atomic_and_i8(i8* %ptr, i8 %andend) { ; CHECK-LABEL: @test_atomic_and_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -67,7 +67,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw and i8* %ptr, i8 %andend release ret i8 %res @@ -75,7 +75,7 @@ define i16 @test_atomic_nand_i16(i16* %ptr, i16 %nandend) { ; CHECK-LABEL: @test_atomic_nand_i16 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr) @@ -87,7 +87,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i16 [[OLDVAL]] %res = atomicrmw nand i16* %ptr, i16 %nandend seq_cst ret i16 %res @@ -95,7 +95,7 @@ define i64 @test_atomic_or_i64(i64* %ptr, i64 %orend) { ; CHECK-LABEL: @test_atomic_or_i64 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8* @@ -115,7 +115,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i64 [[OLDVAL]] %res = atomicrmw or i64* %ptr, i64 %orend seq_cst ret i64 %res @@ -123,7 +123,7 @@ define i8 @test_atomic_xor_i8(i8* %ptr, i8 %xorend) { ; CHECK-LABEL: @test_atomic_xor_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -134,7 +134,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw xor i8* %ptr, i8 %xorend seq_cst ret i8 %res @@ -142,7 +142,7 @@ define i8 @test_atomic_max_i8(i8* %ptr, i8 %maxend) { ; CHECK-LABEL: @test_atomic_max_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -154,7 +154,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw max i8* %ptr, i8 %maxend seq_cst ret i8 %res @@ -162,7 +162,7 @@ define i8 @test_atomic_min_i8(i8* %ptr, i8 %minend) { ; CHECK-LABEL: @test_atomic_min_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 
@llvm.arm.ldrex.p0i8(i8* %ptr) @@ -174,7 +174,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw min i8* %ptr, i8 %minend seq_cst ret i8 %res @@ -182,7 +182,7 @@ define i8 @test_atomic_umax_i8(i8* %ptr, i8 %umaxend) { ; CHECK-LABEL: @test_atomic_umax_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -194,7 +194,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw umax i8* %ptr, i8 %umaxend seq_cst ret i8 %res @@ -202,7 +202,7 @@ define i8 @test_atomic_umin_i8(i8* %ptr, i8 %uminend) { ; CHECK-LABEL: @test_atomic_umin_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -214,7 +214,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw umin i8* %ptr, i8 %uminend seq_cst ret i8 %res @@ -222,7 +222,7 @@ define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) { ; CHECK-LABEL: @test_cmpxchg_i8_seqcst_seqcst -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -238,11 +238,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: @@ -256,7 +256,7 @@ define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newval) { ; CHECK-LABEL: @test_cmpxchg_i16_seqcst_monotonic -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -272,11 +272,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: @@ -290,7 +290,7 @@ define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newval) { ; CHECK-LABEL: @test_cmpxchg_i32_acquire_acquire -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -304,11 +304,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence acquire +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK: fence acquire +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: @@ -322,7 +322,7 @@ define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %newval) { ; CHECK-LABEL: @test_cmpxchg_i64_monotonic_monotonic -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label 
%[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -347,11 +347,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: Index: test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll =================================================================== --- test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll +++ test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll @@ -2,7 +2,8 @@ define i32 @test_cmpxchg_seq_cst(i32* %addr, i32 %desired, i32 %new) { ; CHECK-LABEL: @test_cmpxchg_seq_cst -; CHECK: fence release +; Intrinsic for "dmb ishst" is then expected +; CHECK: call void @llvm.arm.dmb(i32 10) ; CHECK: br label %[[START:.*]] ; CHECK: [[START]]: @@ -16,11 +17,11 @@ ; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[END:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[END]] ; CHECK: [[END]]: @@ -34,7 +35,7 @@ define i1 @test_cmpxchg_weak_fail(i32* %addr, i32 %desired, i32 %new) { ; CHECK-LABEL: @test_cmpxchg_weak_fail -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 10) ; CHECK: br label %[[START:.*]] ; CHECK: [[START]]: @@ -48,11 +49,11 @@ ; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[END:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[END]] ; CHECK: [[END]]: @@ -66,7 +67,7 @@ define i32 @test_cmpxchg_monotonic(i32* %addr, i32 %desired, i32 %new) { ; CHECK-LABEL: @test_cmpxchg_monotonic -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[START:.*]] ; CHECK: [[START]]: @@ -80,11 +81,11 @@ ; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]] ; CHECK: [[SUCCESS_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[END:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[END]] ; CHECK: [[END]]:
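Closing note on the X86 heuristic that moved into X86ISelLowering.cpp: operations with a direct x86 form (xchg/xadd, or a lock-prefixed add/sub/and/or/xor when the old value is unused) are left for ISel, everything else is expanded to a cmpxchg loop. A user-level sketch of the three cases; function names are hypothetical and the comments describe the typical x86-64 lowering:

    #include <atomic>

    std::atomic<long> Counter;

    void bump()          { Counter.fetch_add(1); }          // result unused: lock add
    long bumpAndGet()    { return Counter.fetch_add(1); }   // result used: lock xadd
    long setBitAndGet()  { return Counter.fetch_or(1); }    // or cannot return the old value: cmpxchg loop
    long takeMax(long V) {                                  // no atomic max instruction: cmpxchg loop
      long Cur = Counter.load(std::memory_order_relaxed);
      while (Cur < V && !Counter.compare_exchange_weak(Cur, V)) {
      }
      return Cur;
    }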