diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2114,6 +2114,11 @@ function with a tail call. The prototype of a thunk should not be used for optimization purposes. The caller is expected to cast the thunk prototype to match the thunk target prototype. + +``"tls-load-hoist"`` + This attribute indicates that the function will try to reduce redundant + TLS address calculation by hoisting TLS variables. + ``uwtable[(sync|async)]`` This attribute indicates that the ABI being targeted requires that an unwind table entry be produced for this function even if we can diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def --- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -47,6 +47,7 @@ FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ()) FUNCTION_PASS("lowerinvoke", LowerInvokePass, ()) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ()) +FUNCTION_PASS("tlshoist", TLSVariableHoistPass, ()) FUNCTION_PASS("verify", VerifierPass, ()) #undef FUNCTION_PASS diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -445,6 +445,7 @@ void initializeTargetPassConfigPass(PassRegistry&); void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); void initializeThreadSanitizerLegacyPassPass(PassRegistry&); +void initializeTLSVariableHoistLegacyPassPass(PassRegistry &); void initializeTwoAddressInstructionPassPass(PassRegistry&); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeTypePromotionPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -177,6 +177,7 @@ (void) 
llvm::createStripDeadDebugInfoPass(); (void) llvm::createStripDeadPrototypesPass(); (void) llvm::createTailCallEliminationPass(); + (void)llvm::createTLSVariableHoistPass(); (void) llvm::createJumpThreadingPass(); (void) llvm::createDFAJumpThreadingPass(); (void) llvm::createUnifyFunctionExitNodesPass(); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -427,6 +427,12 @@ // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); +//===----------------------------------------------------------------------===// +// +// TLSVariableHoist - This pass reduces duplicated TLS address calls. +// +FunctionPass *createTLSVariableHoistPass(); + //===----------------------------------------------------------------------===// // // LowerConstantIntrinsicss - Expand any remaining llvm.objectsize and diff --git a/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h b/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h @@ -0,0 +1,131 @@ +//==- TLSVariableHoist.h ------ Remove Redundant TLS Loads -------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminates Redundant TLS Loads if related option is set. 
+// For example: +// static __thread int x; +// int g(); +// int f(int c) { +// int *px = &x; +// while (c--) +// *px += g(); +// return *px; +// } +// +// will generate Redundant TLS Loads by compiling it with +// clang++ -fPIC -ftls-model=global-dynamic -O2 -S +// +// .LBB0_2: # %while.body +// # =>This Inner Loop Header: Depth=1 +// callq _Z1gv@PLT +// movl %eax, %ebp +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// addl _ZL1x@DTPOFF(%rax), %ebp +// movl %ebp, _ZL1x@DTPOFF(%rax) +// addl $-1, %ebx +// jne .LBB0_2 +// jmp .LBB0_3 +// .LBB0_4: # %entry.while.end_crit_edge +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// movl _ZL1x@DTPOFF(%rax), %ebp +// +// The Redundant TLS Loads will hurt the performance, especially in loops. +// So we try to eliminate/move them if required by customers, let it be: +// +// # %bb.0: # %entry +// ... +// movl %edi, %ebx +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// leaq _ZL1x@DTPOFF(%rax), %r14 +// testl %ebx, %ebx +// je .LBB0_1 +// .LBB0_2: # %while.body +// # =>This Inner Loop Header: Depth=1 +// callq _Z1gv@PLT +// addl (%r14), %eax +// movl %eax, (%r14) +// addl $-1, %ebx +// jne .LBB0_2 +// jmp .LBB0_3 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H +#define LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class Function; +class GlobalVariable; +class Instruction; + +/// A private "module" namespace for types and utilities used by +/// TLSVariableHoist. These are implementation details and should +/// not be used by clients. +namespace tlshoist { + +/// Keeps track of the user of a TLS variable and the operand index +/// where the variable is used. 
+struct TLSUser {
+  Instruction *Inst;
+  unsigned OpndIdx;
+
+  TLSUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) {}
+};
+
+/// Keeps track of a TLS variable candidate and its users.
+struct TLSCandidate {
+  SmallVector<TLSUser, 8> Users;
+
+  /// Record that operand \p Idx of \p Inst uses this TLS variable.
+  void addUser(Instruction *Inst, unsigned Idx) {
+    Users.push_back(TLSUser(Inst, Idx));
+  }
+};
+
+} // end namespace tlshoist
+
+class TLSVariableHoistPass : public PassInfoMixin<TLSVariableHoistPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  // Glue for old PM.
+  bool runImpl(Function &F, DominatorTree &DT, LoopInfo &LI);
+
+private:
+  DominatorTree *DT = nullptr;
+  LoopInfo *LI = nullptr;
+
+  /// Maps each TLS variable used in the function to its recorded users.
+  using TLSCandMapType = MapVector<GlobalVariable *, tlshoist::TLSCandidate>;
+  TLSCandMapType TLSCandMap;
+
+  void collectTLSCandidates(Function &Fn);
+  void collectTLSCandidate(Instruction *Inst);
+  Instruction *getNearestLoopDomInst(BasicBlock *BB, Loop *L);
+  Instruction *getDomInst(Instruction *I1, Instruction *I2);
+  BasicBlock::iterator findInsertPos(Function &Fn, GlobalVariable *GV,
+                                     BasicBlock *&PosBB);
+  Instruction *genBitCastInst(Function &Fn, GlobalVariable *GV);
+  bool tryReplaceTLSCandidates(Function &Fn);
+  bool tryReplaceTLSCandidate(Function &Fn, GlobalVariable *GV);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -922,6 +922,9 @@
   // Allow disabling it for testing purposes.
if (!DisableExpandReductions) addPass(createExpandReductionsPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(createTLSVariableHoistPass()); } /// Turn exception handling constructs into something the code generators can diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -212,6 +212,7 @@ #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" #include "llvm/Transforms/Scalar/StructurizeCFG.h" +#include "llvm/Transforms/Scalar/TLSVariableHoist.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -361,6 +361,7 @@ FUNCTION_PASS("verify", ScalarEvolutionVerifierPass()) FUNCTION_PASS("view-cfg", CFGViewerPass()) FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass()) +FUNCTION_PASS("tlshoist", TLSVariableHoistPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) FUNCTION_PASS("memprof", MemProfilerPass()) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -77,6 +77,7 @@ StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp + TLSVariableHoist.cpp WarnMissedTransforms.cpp ADDITIONAL_HEADER_DIRS diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -101,6 +101,7 @@ initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); 
initializeTailCallElimPass(Registry); + initializeTLSVariableHoistLegacyPassPass(Registry); initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReduceLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp @@ -0,0 +1,313 @@ +//===- TLSVariableHoist.cpp -------- Remove Redundant TLS Loads ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminate Redundant TLS Loads if related option is set. +// The example: Please refer to the comment at the head of TLSVariableHoist.h. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/TLSVariableHoist.h" +#include +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace tlshoist; + +#define DEBUG_TYPE "tlshoist" + +// TODO: Support "strict" model if we need to strictly load TLS address, +// because "non-optimize" may also do some optimization in other passes. 
+static cl::opt<std::string> TLSLoadHoist(
+    "tls-load-hoist",
+    cl::desc(
+        "hoist the TLS loads in PIC model: "
+        "tls-load-hoist=optimize: Eliminate redundant TLS load(s). "
+        "tls-load-hoist=strict: Strictly load TLS address before every use. "
+        "tls-load-hoist=non-optimize: Generally load TLS before use(s)."),
+    cl::init("non-optimize"), cl::Hidden);
+
+namespace {
+
+/// Legacy pass manager wrapper that forwards to TLSVariableHoistPass.
+class TLSVariableHoistLegacyPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  TLSVariableHoistLegacyPass() : FunctionPass(ID) {
+    initializeTLSVariableHoistLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  StringRef getPassName() const override { return "TLS Variable Hoist"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+
+private:
+  TLSVariableHoistPass Impl;
+};
+
+} // end anonymous namespace
+
+char TLSVariableHoistLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(TLSVariableHoistLegacyPass, "tlshoist",
+                      "TLS Variable Hoist", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(TLSVariableHoistLegacyPass, "tlshoist",
+                    "TLS Variable Hoist", false, false)
+
+FunctionPass *llvm::createTLSVariableHoistPass() {
+  return new TLSVariableHoistLegacyPass();
+}
+
+/// Perform the TLS Variable Hoist optimization for the given function.
+bool TLSVariableHoistLegacyPass::runOnFunction(Function &Fn) {
+  if (skipFunction(Fn))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "********** Begin TLS Variable Hoist **********\n");
+  LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+  bool MadeChange =
+      Impl.runImpl(Fn, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+                   getAnalysis<LoopInfoWrapperPass>().getLoopInfo());
+
+  if (MadeChange) {
+    LLVM_DEBUG(dbgs() << "********** Function after TLS Variable Hoist: "
+                      << Fn.getName() << '\n');
+    LLVM_DEBUG(dbgs() << Fn);
+  }
+  LLVM_DEBUG(dbgs() << "********** End TLS Variable Hoist **********\n");
+
+  return MadeChange;
+}
+
+void TLSVariableHoistPass::collectTLSCandidate(Instruction *Inst) {
+  // Cast instructions are never recorded as users of a TLS variable.
+  if (Inst->isCast())
+    return;
+
+  // Record every operand slot that directly references a thread-local global.
+  for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+    auto *GV = dyn_cast<GlobalVariable>(Inst->getOperand(Idx));
+    if (!GV || !GV->isThreadLocal())
+      continue;
+
+    // Add Candidate to TLSCandMap (GV --> Candidate).
+    TLSCandMap[GV].addUser(Inst, Idx);
+  }
+}
+
+void TLSVariableHoistPass::collectTLSCandidates(Function &Fn) {
+  // Always drop candidates left over from a previously processed function.
+  TLSCandMap.clear();
+
+  // First, quickly check if the module has any TLS variable at all.
+  Module *M = Fn.getParent();
+  bool HasTLS = llvm::any_of(
+      M->globals(), [](GlobalVariable &GV) { return GV.isThreadLocal(); });
+
+  // If none, directly return.
+  if (!HasTLS)
+    return;
+
+  // Then, collect TLS Variable info.
+  for (BasicBlock &BB : Fn) {
+    // Ignore unreachable basic blocks.
+    if (!DT->isReachableFromEntry(&BB))
+      continue;
+
+    for (Instruction &Inst : BB)
+      collectTLSCandidate(&Inst);
+  }
+}
+
+static bool oneUseOutsideLoop(tlshoist::TLSCandidate &Cand, LoopInfo *LI) {
+  if (Cand.Users.size() != 1)
+    return false;
+
+  BasicBlock *BB = Cand.Users[0].Inst->getParent();
+  if (LI->getLoopFor(BB))
+    return false;
+
+  return true;
+}
+
+Instruction *TLSVariableHoistPass::getNearestLoopDomInst(BasicBlock *BB,
+                                                         Loop *L) {
+  assert(L && "Unexpected Loop status!");
+
+  // Get the outermost loop.
+  while (Loop *Parent = L->getParentLoop())
+    L = Parent;
+
+  BasicBlock *PreHeader = L->getLoopPreheader();
+
+  // A preheader is the unique out-of-loop predecessor of the header.
+  if (PreHeader)
+    return PreHeader->getTerminator();
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Dom = Header;
+  for (BasicBlock *PredBB : predecessors(Header))
+    Dom = DT->findNearestCommonDominator(Dom, PredBB);
+
+  assert(Dom && "Dominator BB not found!");
+  Instruction *Term = Dom->getTerminator();
+
+  return Term;
+}
+
+Instruction *TLSVariableHoistPass::getDomInst(Instruction *I1,
+                                              Instruction *I2) {
+  if (!I1)
+    return I2;
+  if (DT->dominates(I1, I2))
+    return I1;
+  if (DT->dominates(I2, I1))
+    return I2;
+
+  // If there is no dominance relation, use common dominator.
+  BasicBlock *DomBB =
+      DT->findNearestCommonDominator(I1->getParent(), I2->getParent());
+
+  Instruction *Dom = DomBB->getTerminator();
+  assert(Dom && "Common dominator not found!");
+
+  return Dom;
+}
+
+BasicBlock::iterator TLSVariableHoistPass::findInsertPos(Function &Fn,
+                                                         GlobalVariable *GV,
+                                                         BasicBlock *&PosBB) {
+  tlshoist::TLSCandidate &Cand = TLSCandMap[GV];
+
+  // Hoist the TLS use out of loops: for a user inside a loop, pick an
+  // instruction that dominates its outermost enclosing loop.
+  Instruction *LastPos = nullptr;
+  for (auto &User : Cand.Users) {
+    BasicBlock *BB = User.Inst->getParent();
+    Instruction *Pos = User.Inst;
+    if (Loop *L = LI->getLoopFor(BB)) {
+      Pos = getNearestLoopDomInst(BB, L);
+      assert(Pos && "Insert position out of loop not found!");
+    }
+    Pos = getDomInst(LastPos, Pos);
+    LastPos = Pos;
+  }
+
+  assert(LastPos && "Unexpected insert position!");
+  BasicBlock *Parent = LastPos->getParent();
+  PosBB = Parent;
+  return LastPos->getIterator();
+}
+
+// Generate a bitcast (no type change) to replace the uses of TLS Candidate.
+Instruction *TLSVariableHoistPass::genBitCastInst(Function &Fn,
+                                                  GlobalVariable *GV) {
+  BasicBlock *PosBB = &Fn.getEntryBlock();
+  BasicBlock::iterator Iter = findInsertPos(Fn, GV, PosBB);
+  Type *Ty = GV->getType();
+  auto *CastInst = new BitCastInst(GV, Ty, "tls_bitcast");
+  PosBB->getInstList().insert(Iter, CastInst);
+  return CastInst;
+}
+
+bool TLSVariableHoistPass::tryReplaceTLSCandidate(Function &Fn,
+                                                  GlobalVariable *GV) {
+
+  tlshoist::TLSCandidate &Cand = TLSCandMap[GV];
+
+  // A single use that is not inside a loop does not need to be replaced.
+  if (oneUseOutsideLoop(Cand, LI))
+    return false;
+
+  // Generate a bitcast (no type change) of the TLS variable ...
+  auto *CastInst = genBitCastInst(Fn, GV);
+
+  // ... and make every recorded user refer to it instead of the variable.
+  for (auto &User : Cand.Users)
+    User.Inst->setOperand(User.OpndIdx, CastInst);
+
+  return true;
+}
+
+bool TLSVariableHoistPass::tryReplaceTLSCandidates(Function &Fn) {
+  if (TLSCandMap.empty())
+    return false;
+
+  bool Replaced = false;
+  for (auto &GV2Cand : TLSCandMap) {
+    GlobalVariable *GV = GV2Cand.first;
+    Replaced |= tryReplaceTLSCandidate(Fn, GV);
+  }
+
+  return Replaced;
+}
+
+/// Optimize expensive TLS variables in the given function.
+bool TLSVariableHoistPass::runImpl(Function &Fn, DominatorTree &DT,
+                                   LoopInfo &LI) {
+  if (Fn.hasOptNone())
+    return false;
+
+  if (TLSLoadHoist != "optimize" &&
+      !Fn.hasFnAttribute("tls-load-hoist"))
+    return false;
+
+  this->LI = &LI;
+  this->DT = &DT;
+  assert(this->LI && this->DT && "Unexpected requirement!");
+
+  // Collect all TLS variable candidates.
+  collectTLSCandidates(Fn);
+
+  bool MadeChange = tryReplaceTLSCandidates(Fn);
+
+  return MadeChange;
+}
+
+PreservedAnalyses TLSVariableHoistPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+  if (!runImpl(F, DT, LI))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -61,6 +61,8 @@
 ; CHECK-NEXT: Expand vector predication intrinsics
 ; CHECK-NEXT: Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT: Expand reduction intrinsics
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: TLS Variable Hoist
 ; CHECK-NEXT: Stack Safety Analysis
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -214,6 +214,8 @@
 ; GCN-O1-NEXT: Expand vector predication intrinsics
 ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics
 ; GCN-O1-NEXT: Expand reduction intrinsics
+; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: TLS Variable Hoist
 ; GCN-O1-NEXT: AMDGPU Attributor
 ; GCN-O1-NEXT: CallGraph Construction
 ; GCN-O1-NEXT: Call Graph SCC Pass Manager
@@ -484,6 +486,8 @@
 ; GCN-O1-OPTS-NEXT: Expand vector predication intrinsics
 ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics
 ; GCN-O1-OPTS-NEXT:
Expand reduction intrinsics +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: TLS Variable Hoist ; GCN-O1-OPTS-NEXT: Early CSE ; GCN-O1-OPTS-NEXT: AMDGPU Attributor ; GCN-O1-OPTS-NEXT: CallGraph Construction @@ -769,6 +773,8 @@ ; GCN-O2-NEXT: Expand vector predication intrinsics ; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O2-NEXT: Expand reduction intrinsics +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: TLS Variable Hoist ; GCN-O2-NEXT: Early CSE ; GCN-O2-NEXT: AMDGPU Attributor ; GCN-O2-NEXT: CallGraph Construction @@ -1062,6 +1068,7 @@ ; GCN-O3-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O3-NEXT: Expand reduction intrinsics ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: TLS Variable Hoist ; GCN-O3-NEXT: Phi Values Analysis ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -41,6 +41,7 @@ ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: TLS Variable Hoist ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -65,6 +65,7 @@ ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: TLS Variable Hoist ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- 
a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -60,6 +60,8 @@ ; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: TLS Variable Hoist ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: X86 Partial Reduction ; CHECK-NEXT: Expand indirectbr instructions diff --git a/llvm/test/CodeGen/X86/tls-loads-control.ll b/llvm/test/CodeGen/X86/tls-loads-control.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/tls-loads-control.ll @@ -0,0 +1,248 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=optimize --stop-after=tlshoist -o - %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --stop-after=tlshoist -o - %s | FileCheck %s + +; This test come from compiling clang/test/CodeGen/intel/tls_loads.cpp with: +; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm) + +; // Variable declaration and definition: +; thread_local int thl_x; +; thread_local int thl_x2; +; +; struct SS { +; char thl_c; +; int num; +; }; +; +; int gfunc(); +; int gfunc2(int); + +; // First function (@_Z2f1i): +; int f1(int c) { +; while (c) +; c++; +; +; int *px = &thl_x; +; c -= gfunc(); +; +; while(c++) { +; c = gfunc(); +; while (c--) +; *px += gfunc2(thl_x2); +; } +; return *px; +; } + +$_ZTW5thl_x = comdat any + +$_ZTW6thl_x2 = comdat any + +@thl_x = thread_local global i32 0, align 4 +@thl_x2 = thread_local global i32 0, align 4 +@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4 +@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4 + +; Function Attrs: mustprogress uwtable +define noundef i32 @_Z2f1i(i32 noundef %c) local_unnamed_addr #0 { +; CHECK-LABEL: _Z2f1i +; CHECK: entry: +; CHECK-NEXT: %call = tail call noundef i32 @_Z5gfuncv() +; CHECK-NEXT: %phi.cmp = icmp 
eq i32 %call, 0 +; CHECK-NEXT: %tls_bitcast1 = bitcast i32* @thl_x to i32* +; CHECK-NEXT: br i1 %phi.cmp, label %while.end11, label %while.body4.preheader + +; CHECK: while.body4.preheader: +; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x2 to i32* +; CHECK-NEXT: br label %while.body4 + +; CHECK: while.body4: +; CHECK-NEXT: %call5 = tail call noundef i32 @_Z5gfuncv() +; CHECK-NEXT: %tobool7.not18 = icmp eq i32 %call5, 0 +; CHECK-NEXT: br i1 %tobool7.not18, label %while.body4.backedge, label %while.body8.preheader + +; CHECK: while.body8.preheader: +; CHECK-NEXT: br label %while.body8 + +; CHECK: while.body4.backedge.loopexit: +; CHECK-NEXT: br label %while.body4.backedge + +; CHECK: while.body4.backedge: +; CHECK-NEXT: br label %while.body4, !llvm.loop !4 + +; CHECK: while.body8: +; CHECK-NEXT: %c.addr.219 = phi i32 [ %dec, %while.body8 ], [ %call5, %while.body8.preheader ] +; CHECK-NEXT: %dec = add i32 %c.addr.219, -1 +; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4 +; CHECK-NEXT: %call9 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0) +; CHECK-NEXT: %1 = load i32, i32* %tls_bitcast1, align 4 +; CHECK-NEXT: %add = add nsw i32 %1, %call9 +; CHECK-NEXT: store i32 %add, i32* %tls_bitcast1, align 4 +; CHECK-NEXT: %tobool7.not = icmp eq i32 %dec, 0 +; CHECK-NEXT: br i1 %tobool7.not, label %while.body4.backedge.loopexit, label %while.body8, !llvm.loop !4 + +; CHECK: while.end11: +; CHECK-NEXT: %2 = load i32, i32* %tls_bitcast1, align 4 +; CHECK-NEXT: ret i32 %2 + +entry: + %call = tail call noundef i32 @_Z5gfuncv() + %phi.cmp = icmp eq i32 %call, 0 + br i1 %phi.cmp, label %while.end11, label %while.body4 + +while.body4: ; preds = %entry, %while.body4.backedge + %call5 = tail call noundef i32 @_Z5gfuncv() + %tobool7.not18 = icmp eq i32 %call5, 0 + br i1 %tobool7.not18, label %while.body4.backedge, label %while.body8 + +while.body4.backedge: ; preds = %while.body8, %while.body4 + br label %while.body4, !llvm.loop !4 + +while.body8: ; preds = %while.body4, 
%while.body8 + %c.addr.219 = phi i32 [ %dec, %while.body8 ], [ %call5, %while.body4 ] + %dec = add nsw i32 %c.addr.219, -1 + %0 = load i32, i32* @thl_x2, align 4 + %call9 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0) + %1 = load i32, i32* @thl_x, align 4 + %add = add nsw i32 %1, %call9 + store i32 %add, i32* @thl_x, align 4 + %tobool7.not = icmp eq i32 %dec, 0 + br i1 %tobool7.not, label %while.body4.backedge, label %while.body8, !llvm.loop !4 + +while.end11: ; preds = %entry + %2 = load i32, i32* @thl_x, align 4 + ret i32 %2 +} + +; // Sencond function (@_Z2f2i): +; int f2(int c) { +; thread_local struct SS st; +; c += gfunc(); +; while (c--) { +; thl_x += gfunc(); +; st.thl_c += (char)gfunc(); +; st.num += gfunc(); +; } +; return thl_x; +; } +declare noundef i32 @_Z5gfuncv() local_unnamed_addr #1 + +declare noundef i32 @_Z6gfunc2i(i32 noundef) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define noundef i32 @_Z2f2i(i32 noundef %c) local_unnamed_addr #0 { +; CHECK-LABEL: _Z2f2i +; CHECK: entry: +; CHECK-NEXT: %call = tail call noundef i32 @_Z5gfuncv() +; CHECK-NEXT: %add = add nsw i32 %call, %c +; CHECK-NEXT: %tobool.not12 = icmp eq i32 %add, 0 +; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CHECK-NEXT: br i1 %tobool.not12, label %while.end, label %while.body.preheader + +; CHECK: while.body.preheader: +; CHECK-NEXT: %tls_bitcast1 = bitcast i8* @_ZZ2f2iE2st.0 to i8* +; CHECK-NEXT: %tls_bitcast2 = bitcast i32* @_ZZ2f2iE2st.1 to i32* +; CHECK-NEXT: br label %while.body + +; CHECK: while.body: +; CHECK-NEXT: %c.addr.013 = phi i32 [ %dec, %while.body ], [ %add, %while.body.preheader ] +; CHECK-NEXT: %dec = add i32 %c.addr.013, -1 +; CHECK-NEXT: %call1 = tail call noundef i32 @_Z5gfuncv() +; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4 +; CHECK-NEXT: %add2 = add nsw i32 %0, %call1 +; CHECK-NEXT: store i32 %add2, i32* %tls_bitcast, align 4 +; CHECK-NEXT: %call3 = tail call noundef i32 @_Z5gfuncv() +; CHECK-NEXT: %1 = load 
i8, i8* %tls_bitcast1, align 4 +; CHECK-NEXT: %2 = trunc i32 %call3 to i8 +; CHECK-NEXT: %conv7 = add i8 %1, %2 +; CHECK-NEXT: store i8 %conv7, i8* %tls_bitcast1, align 4 +; CHECK-NEXT: %call8 = tail call noundef i32 @_Z5gfuncv() +; CHECK-NEXT: %3 = load i32, i32* %tls_bitcast2, align 4 +; CHECK-NEXT: %add9 = add nsw i32 %3, %call8 +; CHECK-NEXT: store i32 %add9, i32* %tls_bitcast2, align 4 +; CHECK-NEXT: %tobool.not = icmp eq i32 %dec, 0 +; CHECK-NEXT: br i1 %tobool.not, label %while.end.loopexit, label %while.body + +; CHECK: while.end.loopexit: +; CHECK-NEXT: br label %while.end + +; CHECK: while.end: +; CHECK-NEXT: %4 = load i32, i32* %tls_bitcast, align 4 +; CHECK-NEXT: ret i32 %4 +entry: + %call = tail call noundef i32 @_Z5gfuncv() + %add = add nsw i32 %call, %c + %tobool.not12 = icmp eq i32 %add, 0 + br i1 %tobool.not12, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.addr.013 = phi i32 [ %dec, %while.body ], [ %add, %entry ] + %dec = add nsw i32 %c.addr.013, -1 + %call1 = tail call noundef i32 @_Z5gfuncv() + %0 = load i32, i32* @thl_x, align 4 + %add2 = add nsw i32 %0, %call1 + store i32 %add2, i32* @thl_x, align 4 + %call3 = tail call noundef i32 @_Z5gfuncv() + %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4 + %2 = trunc i32 %call3 to i8 + %conv7 = add i8 %1, %2 + store i8 %conv7, i8* @_ZZ2f2iE2st.0, align 4 + %call8 = tail call noundef i32 @_Z5gfuncv() + %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4 + %add9 = add nsw i32 %3, %call8 + store i32 %add9, i32* @_ZZ2f2iE2st.1, align 4 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %4 = load i32, i32* @thl_x, align 4 + ret i32 %4 +} + +; // Third function (@_Z2f3i): +; int f3(int c) { +; int *px = &thl_x; +; gfunc2(*px); +; gfunc2(*px); +; return 1; +; } + +; Function Attrs: mustprogress uwtable +define noundef i32 @_Z2f3i(i32 noundef %c) local_unnamed_addr #0 { +; CHECK-LABEL: _Z2f3i +; 
CHECK: entry: +; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4 +; CHECK-NEXT: %call = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0) +; CHECK-NEXT: %1 = load i32, i32* %tls_bitcast, align 4 +; CHECK-NEXT: %call1 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %1) +; CHECK-NEXT: ret i32 1 +entry: + %0 = load i32, i32* @thl_x, align 4 + %call = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0) + %1 = load i32, i32* @thl_x, align 4 + %call1 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %1) + ret i32 1 +} + +; Function Attrs: uwtable +define weak_odr hidden noundef i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat { + ret i32* @thl_x +} + +; Function Attrs: uwtable +define weak_odr hidden noundef i32* @_ZTW6thl_x2() local_unnamed_addr #2 comdat { + ret i32* @thl_x2 +} + +attributes #0 = { mustprogress uwtable "tls-load-hoist" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 2} +!3 = !{!"clang version 15.0.0"} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/CodeGen/X86/tls-loads-control2.ll b/llvm/test/CodeGen/X86/tls-loads-control2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/tls-loads-control2.ll @@ -0,0 +1,51 @@ +; RUN: opt -S 
-mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=optimize -o - %s | FileCheck %s --check-prefix=HOIST0 +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=non-optimize -o - %s | FileCheck %s --check-prefix=HOIST2 +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2 + +$_ZTW5thl_x = comdat any + +@thl_x = thread_local global i32 0, align 4 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* @thl_x, align 4 + %call = tail call i32 @_Z5gfunci(i32 %0) + %1 = load i32, i32* @thl_x, align 4 + %call1 = tail call i32 @_Z5gfunci(i32 %1) + ret i32 1 +} + +;HOIST0-LABEL: _Z2f1i +;HOIST0: entry: +;HOIST0-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +;HOIST0-NEXT: %0 = load i32, i32* %tls_bitcast, align 4 +;HOIST0-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0) +;HOIST0-NEXT: %1 = load i32, i32* %tls_bitcast, align 4 +;HOIST0-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1) +;HOIST0-NEXT: ret i32 1 + +;HOIST2-LABEL: _Z2f1i +;HOIST2: entry: +;HOIST2-NEXT: %0 = load i32, i32* @thl_x, align 4 +;HOIST2-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0) +;HOIST2-NEXT: %1 = load i32, i32* @thl_x, align 4 +;HOIST2-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1) +;HOIST2-NEXT: ret i32 1 + +declare i32 @_Z5gfunci(i32) local_unnamed_addr #1 + +; Function Attrs: uwtable +define weak_odr hidden i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat { + ret i32* @thl_x +} + +attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} diff --git a/llvm/test/CodeGen/X86/tls-loads-control3.ll b/llvm/test/CodeGen/X86/tls-loads-control3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/tls-loads-control3.ll @@ -0,0 +1,358 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=optimize -o - %s | FileCheck %s --check-prefix=HOIST0 +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=non-optimize -o - %s | FileCheck %s --check-prefix=HOIST2 +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2 + +; This test has no module flag {"tls-load-hoist", i32 0}, so use --tls-load-hoist=x +; to choose how thread_local addresses are loaded. + +; This test comes from compiling clang/test/CodeGen/intel/tls_loads.cpp with: +; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm) + +$_ZTW5thl_x = comdat any + +$_ZTW6thl_x2 = comdat any + +@thl_x = thread_local global i32 0, align 4 +@thl_x2 = thread_local global i32 0, align 4 +@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4 +@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4 + +; For HOIST0, check that __tls_get_addr@PLT is called only once for each thread_local variable. +; For HOIST2, check the default behavior: __tls_get_addr@PLT is usually called every time a thread_local variable is used.
+ +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +; HOIST0-LABEL: _Z2f1i: +; HOIST0: # %bb.0: # %entry +; HOIST0-NEXT: pushq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: pushq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: pushq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 32 +; HOIST0-NEXT: .cfi_offset %rbx, -32 +; HOIST0-NEXT: .cfi_offset %r14, -24 +; HOIST0-NEXT: .cfi_offset %r15, -16 +; HOIST0-NEXT: movl %edi, %ebx +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %r14 +; HOIST0-NEXT: testl %ebx, %ebx +; HOIST0-NEXT: je .LBB0_4 +; HOIST0-NEXT: # %bb.1: # %while.body.preheader +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x2@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %r15 +; HOIST0-NEXT: .p2align 4, 0x90 +; HOIST0-NEXT: .LBB0_2: # %while.body +; HOIST0-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST0-NEXT: movl (%r15), %edi +; HOIST0-NEXT: callq _Z6gfunc2i@PLT +; HOIST0-NEXT: addl (%r14), %eax +; HOIST0-NEXT: movl %eax, (%r14) +; HOIST0-NEXT: decl %ebx +; HOIST0-NEXT: jne .LBB0_2 +; HOIST0-NEXT: jmp .LBB0_3 +; HOIST0-NEXT: .LBB0_4: # %entry.while.end_crit_edge +; HOIST0-NEXT: movl (%r14), %eax +; HOIST0-NEXT: .LBB0_3: # %while.end +; HOIST0-NEXT: popq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: popq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: popq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 8 +; HOIST0-NEXT: retq +; +; HOIST2-LABEL: _Z2f1i: +; HOIST2: # %bb.0: # %entry +; HOIST2-NEXT: pushq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: pushq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 24 +; HOIST2-NEXT: pushq %rax +; HOIST2-NEXT: .cfi_def_cfa_offset 32 +; HOIST2-NEXT: .cfi_offset %rbx, -24 
+; HOIST2-NEXT: .cfi_offset %rbp, -16 +; HOIST2-NEXT: testl %edi, %edi +; HOIST2-NEXT: je .LBB0_4 +; HOIST2-NEXT: # %bb.1: +; HOIST2-NEXT: movl %edi, %ebx +; HOIST2-NEXT: .p2align 4, 0x90 +; HOIST2-NEXT: .LBB0_2: # %while.body +; HOIST2-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x2@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movl (%rax), %edi +; HOIST2-NEXT: callq _Z6gfunc2i@PLT +; HOIST2-NEXT: movl %eax, %ebp +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: addl (%rax), %ebp +; HOIST2-NEXT: movl %ebp, (%rax) +; HOIST2-NEXT: decl %ebx +; HOIST2-NEXT: jne .LBB0_2 +; HOIST2-NEXT: jmp .LBB0_3 +; HOIST2-NEXT: .LBB0_4: # %entry.while.end_crit_edge +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movl (%rax), %ebp +; HOIST2-NEXT: .LBB0_3: # %while.end +; HOIST2-NEXT: movl %ebp, %eax +; HOIST2-NEXT: addq $8, %rsp +; HOIST2-NEXT: .cfi_def_cfa_offset 24 +; HOIST2-NEXT: popq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: popq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 8 +; HOIST2-NEXT: retq +entry: + %tobool.not3 = icmp eq i32 %c, 0 + br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body + +entry.while.end_crit_edge: ; preds = %entry + %.pre = load i32, i32* @thl_x, align 4 + br label %while.end + +while.body: ; preds = %entry, %while.body + %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.04, -1 + %0 = load i32, i32* @thl_x2, align 4 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4 + %add = add nsw i32 %1, %call + store i32 %add, i32* @thl_x, align 
4 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry.while.end_crit_edge + %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ] + ret i32 %2 +} + +declare i32 @_Z6gfunc2i(i32) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f2i(i32 %c) local_unnamed_addr #0 { +; HOIST0-LABEL: _Z2f2i: +; HOIST0: # %bb.0: # %entry +; HOIST0-NEXT: pushq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: pushq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: pushq %r12 +; HOIST0-NEXT: .cfi_def_cfa_offset 32 +; HOIST0-NEXT: pushq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 40 +; HOIST0-NEXT: pushq %rax +; HOIST0-NEXT: .cfi_def_cfa_offset 48 +; HOIST0-NEXT: .cfi_offset %rbx, -40 +; HOIST0-NEXT: .cfi_offset %r12, -32 +; HOIST0-NEXT: .cfi_offset %r14, -24 +; HOIST0-NEXT: .cfi_offset %r15, -16 +; HOIST0-NEXT: movl %edi, %ebx +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %r14 +; HOIST0-NEXT: testl %ebx, %ebx +; HOIST0-NEXT: je .LBB1_3 +; HOIST0-NEXT: # %bb.1: # %while.body.preheader +; HOIST0-NEXT: leaq _ZZ2f2iE2st.0@TLSLD(%rip), %rdi +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %rcx +; HOIST0-NEXT: leaq _ZZ2f2iE2st.0@DTPOFF(%rax), %r15 +; HOIST0-NEXT: leaq _ZZ2f2iE2st.1@DTPOFF(%rax), %r12 +; HOIST0-NEXT: .p2align 4, 0x90 +; HOIST0-NEXT: .LBB1_2: # %while.body +; HOIST0-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST0-NEXT: callq _Z5gfuncv@PLT +; HOIST0-NEXT: addl %eax, (%r14) +; HOIST0-NEXT: callq _Z5gfuncv@PLT +; HOIST0-NEXT: addb %al, (%r15) +; HOIST0-NEXT: callq _Z5gfuncv@PLT +; HOIST0-NEXT: addl %eax, (%r12) +; HOIST0-NEXT: decl %ebx +; HOIST0-NEXT: jne .LBB1_2 +; HOIST0-NEXT: .LBB1_3: # %while.end +; HOIST0-NEXT: movl (%r14), %eax +; HOIST0-NEXT: addq 
$8, %rsp +; HOIST0-NEXT: .cfi_def_cfa_offset 40 +; HOIST0-NEXT: popq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 32 +; HOIST0-NEXT: popq %r12 +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: popq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: popq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 8 +; HOIST0-NEXT: retq +; +; HOIST2-LABEL: _Z2f2i: +; HOIST2: # %bb.0: # %entry +; HOIST2-NEXT: pushq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: pushq %r14 +; HOIST2-NEXT: .cfi_def_cfa_offset 24 +; HOIST2-NEXT: pushq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 32 +; HOIST2-NEXT: .cfi_offset %rbx, -32 +; HOIST2-NEXT: .cfi_offset %r14, -24 +; HOIST2-NEXT: .cfi_offset %rbp, -16 +; HOIST2-NEXT: testl %edi, %edi +; HOIST2-NEXT: je .LBB1_3 +; HOIST2-NEXT: # %bb.1: # %while.body.preheader +; HOIST2-NEXT: movl %edi, %ebx +; HOIST2-NEXT: .p2align 4, 0x90 +; HOIST2-NEXT: .LBB1_2: # %while.body +; HOIST2-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST2-NEXT: callq _Z5gfuncv@PLT +; HOIST2-NEXT: movl %eax, %ebp +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: addl %ebp, (%rax) +; HOIST2-NEXT: callq _Z5gfuncv@PLT +; HOIST2-NEXT: movl %eax, %ebp +; HOIST2-NEXT: leaq _ZZ2f2iE2st.0@TLSLD(%rip), %rdi +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movq %rax, %r14 +; HOIST2-NEXT: addb %bpl, _ZZ2f2iE2st.0@DTPOFF(%rax) +; HOIST2-NEXT: callq _Z5gfuncv@PLT +; HOIST2-NEXT: movl %eax, %ecx +; HOIST2-NEXT: movq %r14, %rax +; HOIST2-NEXT: addl %ecx, _ZZ2f2iE2st.1@DTPOFF(%r14) +; HOIST2-NEXT: decl %ebx +; HOIST2-NEXT: jne .LBB1_2 +; HOIST2-NEXT: .LBB1_3: # %while.end +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movl (%rax), %eax +; HOIST2-NEXT: popq %rbx +; HOIST2-NEXT: 
.cfi_def_cfa_offset 24 +; HOIST2-NEXT: popq %r14 +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: popq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 8 +; HOIST2-NEXT: retq +entry: + %tobool.not9 = icmp eq i32 %c, 0 + br i1 %tobool.not9, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.010, -1 + %call = tail call i32 @_Z5gfuncv() + %0 = load i32, i32* @thl_x, align 4 + %add = add nsw i32 %0, %call + store i32 %add, i32* @thl_x, align 4 + %call1 = tail call i32 @_Z5gfuncv() + %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4 + %2 = trunc i32 %call1 to i8 + %conv5 = add i8 %1, %2 + store i8 %conv5, i8* @_ZZ2f2iE2st.0, align 4 + %call6 = tail call i32 @_Z5gfuncv() + %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4 + %add7 = add nsw i32 %3, %call6 + store i32 %add7, i32* @_ZZ2f2iE2st.1, align 4 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %4 = load i32, i32* @thl_x, align 4 + ret i32 %4 +} + +declare i32 @_Z5gfuncv() local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f3i(i32 %c) local_unnamed_addr #0 { +; HOIST0-LABEL: _Z2f3i: +; HOIST0: # %bb.0: # %entry +; HOIST0-NEXT: pushq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: .cfi_offset %rbx, -16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %rbx +; HOIST0-NEXT: movl (%rax), %edi +; HOIST0-NEXT: callq _Z6gfunc2i@PLT +; HOIST0-NEXT: movl (%rbx), %edi +; HOIST0-NEXT: callq _Z6gfunc2i@PLT +; HOIST0-NEXT: movl $1, %eax +; HOIST0-NEXT: popq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 8 +; HOIST0-NEXT: retq +; +; HOIST2-LABEL: _Z2f3i: +; HOIST2: # %bb.0: # %entry +; HOIST2-NEXT: pushq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; 
HOIST2-NEXT: .cfi_offset %rbx, -16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movq %rax, %rbx +; HOIST2-NEXT: movl (%rax), %edi +; HOIST2-NEXT: callq _Z6gfunc2i@PLT +; HOIST2-NEXT: movl (%rbx), %edi +; HOIST2-NEXT: callq _Z6gfunc2i@PLT +; HOIST2-NEXT: movl $1, %eax +; HOIST2-NEXT: popq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 8 +; HOIST2-NEXT: retq +entry: + %0 = load i32, i32* @thl_x, align 4 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4 + %call1 = tail call i32 @_Z6gfunc2i(i32 %1) + ret i32 1 +} + +attributes #0 = { nounwind mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -369,6 +369,7 @@ initializeHardwareLoopsPass(*Registry); initializeTransformUtils(*Registry); initializeReplaceWithVeclibLegacyPass(*Registry); + initializeTLSVariableHoistLegacyPassPass(*Registry); // Initialize debugging passes. initializeScavengerTestPass(*Registry);