Index: llvm/include/llvm/CodeGen/MachinePassRegistry.def =================================================================== --- llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -47,6 +47,7 @@ FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ()) FUNCTION_PASS("lowerinvoke", LowerInvokePass, ()) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ()) +FUNCTION_PASS("tlshoist", TLSVariableHoistPass, ()) FUNCTION_PASS("verify", VerifierPass, ()) #undef FUNCTION_PASS Index: llvm/include/llvm/IR/Module.h =================================================================== --- llvm/include/llvm/IR/Module.h +++ llvm/include/llvm/IR/Module.h @@ -910,6 +910,10 @@ int getStackProtectorGuardOffset() const; void setStackProtectorGuardOffset(int Offset); + /// Get/set the model of TLS address loading. + StringRef getTlsAddrLoadHoist() const; + void setTlsAddrLoadHoist(StringRef Model); + /// Get/set the stack alignment overridden from the default. 
unsigned getOverrideStackAlignment() const; void setOverrideStackAlignment(unsigned Align); Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -445,6 +445,7 @@ void initializeTargetPassConfigPass(PassRegistry&); void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); void initializeThreadSanitizerLegacyPassPass(PassRegistry&); +void initializeTLSVariableHoistLegacyPassPass(PassRegistry &); void initializeTwoAddressInstructionPassPass(PassRegistry&); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeTypePromotionPass(PassRegistry&); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -177,6 +177,7 @@ (void) llvm::createStripDeadDebugInfoPass(); (void) llvm::createStripDeadPrototypesPass(); (void) llvm::createTailCallEliminationPass(); + (void)llvm::createTLSVariableHoistPass(); (void) llvm::createJumpThreadingPass(); (void) llvm::createDFAJumpThreadingPass(); (void) llvm::createUnifyFunctionExitNodesPass(); Index: llvm/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/include/llvm/Transforms/Scalar.h +++ llvm/include/llvm/Transforms/Scalar.h @@ -426,6 +426,12 @@ // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); +//===----------------------------------------------------------------------===// +// +// TLSVariableHoist - This pass eliminates redundant TLS address loads.
+// +FunctionPass *createTLSVariableHoistPass(); + //===----------------------------------------------------------------------===// // // LowerConstantIntrinsicss - Expand any remaining llvm.objectsize and Index: llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h @@ -0,0 +1,133 @@ +//==- TLSVariableHoist.h ------ Remove Redundant TLS Loads -----*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminates Redundant TLS Loads if related option is set. +// For example: +// static __thread int x; +// int g(); +// int f(int c) { +// int *px = &x; +// while (c--) +// *px += g(); +// return *px; +// } +// +// will generate Redundant TLS Loads by compiling it with +// Clang++ -fPIC -ftls-model=global-dynamic -O2 -S +// +// .LBB0_2: # %while.body +// # =>This Inner Loop Header: Depth=1 +// callq _Z1gv@PLT +// movl %eax, %ebp +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// addl _ZL1x@DTPOFF(%rax), %ebp +// movl %ebp, _ZL1x@DTPOFF(%rax) +// addl $-1, %ebx +// jne .LBB0_2 +// jmp .LBB0_3 +// .LBB0_4: # %entry.while.end_crit_edge +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// movl _ZL1x@DTPOFF(%rax), %ebp +// +// The Redundant TLS Loads will hurt the performance, especially in loops. +// So we try to eliminate/move them if required by customers, let it be: +// +// # %bb.0: # %entry +// ... 
+// movl %edi, %ebx +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// leaq _ZL1x@DTPOFF(%rax), %r14 +// testl %ebx, %ebx +// je .LBB0_1 +// .LBB0_2: # %while.body +// # =>This Inner Loop Header: Depth=1 +// callq _Z1gv@PLT +// addl (%r14), %eax +// movl %eax, (%r14) +// addl $-1, %ebx +// jne .LBB0_2 +// jmp .LBB0_3 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H +#define LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/PassManager.h" +#include +#include +#include + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class Function; +class GlobalVariable; +class Instruction; + +/// A private "module" namespace for types and utilities used by +/// TLSVariableHoist. These are implementation details and should +/// not be used by clients. +namespace tlshoist { + +/// Keeps track of the user of a TLS variable and the operand index +/// where the variable is used. +struct TLSUser { + Instruction *Inst; + unsigned OpndIdx; + + TLSUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) {} +}; + +/// Keeps track of a TLS variable candidate and its users. +struct TLSCandidate { + SmallVector Users; + GlobalVariable *GV; + + /// Add the user to the use list and update the cost. + void addUser(Instruction *Inst, unsigned Idx) { + Users.push_back(TLSUser(Inst, Idx)); + } +}; + +} // end namespace tlshoist + +class TLSVariableHoistPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + // Glue for old PM. + bool runImpl(Function &F, DominatorTree &DT, LoopInfo &LI); + +private: + DominatorTree *DT; + LoopInfo *LI; + + /// Keeps track of TLS variable candidates found in the function. 
+ using TLSCandMapType = std::map; + TLSCandMapType TLSCandMap; + + /// Use GVs to make sure the order of TLS Global Varibles. + SmallVector GVs; + + void collectTLSCandidates(Function &Fn); + void collectTLSCandidate(Instruction *Inst); + bool tryReplaceTLSCandidates(Function &Fn); + bool tryReplaceTLSCandidate(Function &Fn, GlobalVariable *GV); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -922,6 +922,9 @@ // Allow disabling it for testing purposes. if (!DisableExpandReductions) addPass(createExpandReductionsPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(createTLSVariableHoistPass()); } /// Turn exception handling constructs into something the code generators can Index: llvm/lib/IR/Module.cpp =================================================================== --- llvm/lib/IR/Module.cpp +++ llvm/lib/IR/Module.cpp @@ -726,6 +726,18 @@ addModuleFlag(ModFlagBehavior::Error, "stack-protector-guard-offset", Offset); } +StringRef Module::getTlsAddrLoadHoist() const { + Metadata *MD = getModuleFlag("tls-load-hoist"); + if (auto *MDS = dyn_cast_or_null(MD)) + return MDS->getString(); + return {}; +} + +void Module::setTlsAddrLoadHoist(StringRef Model) { + MDString *ID = MDString::get(getContext(), Model); + addModuleFlag(ModFlagBehavior::Error, "tls-load-hoist", ID); +} + unsigned Module::getOverrideStackAlignment() const { Metadata *MD = getModuleFlag("override-stack-alignment"); if (auto *CI = mdconst::dyn_extract_or_null(MD)) Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -212,6 +212,7 @@ #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include 
"llvm/Transforms/Scalar/StraightLineStrengthReduce.h" #include "llvm/Transforms/Scalar/StructurizeCFG.h" +#include "llvm/Transforms/Scalar/TLSVariableHoist.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -361,6 +361,7 @@ FUNCTION_PASS("verify", ScalarEvolutionVerifierPass()) FUNCTION_PASS("view-cfg", CFGViewerPass()) FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass()) +FUNCTION_PASS("tlshoist", TLSVariableHoistPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) FUNCTION_PASS("memprof", MemProfilerPass()) Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -77,6 +77,7 @@ StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp + TLSVariableHoist.cpp WarnMissedTransforms.cpp ADDITIONAL_HEADER_DIRS Index: llvm/lib/Transforms/Scalar/Scalar.cpp =================================================================== --- llvm/lib/Transforms/Scalar/Scalar.cpp +++ llvm/lib/Transforms/Scalar/Scalar.cpp @@ -104,6 +104,7 @@ initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); initializeTailCallElimPass(Registry); + initializeTLSVariableHoistLegacyPassPass(Registry); initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReduceLegacyPassPass(Registry); Index: llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp =================================================================== --- /dev/null +++ 
llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp @@ -0,0 +1,270 @@ +//===- TLSVariableHoist.cpp -------- Remove Redundant TLS Loads ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminates Redundant TLS Loads if related option is set. +// For an example, refer to the comment at the head of TLSVariableHoist.h. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/TLSVariableHoist.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <tuple> +#include <utility> + +using namespace llvm; +using namespace tlshoist; + +#define DEBUG_TYPE "tlshoist" + +// TODO: Support "Strict" model if we need to strictly load TLS address, +// because "Non-Optimize" may also do some optimization in other passes. +static cl::opt<std::string> TLSLoadHoist( + "tls-load-hoist", + cl::desc( + "hoist the TLS loads in PIC model: " + "tls-load-hoist=Optimize: Eliminate redundant TLS load(s). " + "tls-load-hoist=Strict: Strictly load TLS address before every use. " + "tls-load-hoist=Non-Optimize: Generally load TLS before use(s)."), + cl::init("Non-Optimize"), cl::Hidden); + +namespace { + +/// The TLS Variable hoist pass.
+class TLSVariableHoistLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + TLSVariableHoistLegacyPass() : FunctionPass(ID) { + initializeTLSVariableHoistLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + StringRef getPassName() const override { return "TLS Variable Hoist"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + } + +private: + TLSVariableHoistPass Impl; +}; + +} // end anonymous namespace + +char TLSVariableHoistLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(TLSVariableHoistLegacyPass, "tlshoist", + "TLS Variable Hoist", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(TLSVariableHoistLegacyPass, "tlshoist", + "TLS Variable Hoist", false, false) + +FunctionPass *llvm::createTLSVariableHoistPass() { + return new TLSVariableHoistLegacyPass(); +} + +/// Perform the TLS Variable Hoist optimization for the given function. +bool TLSVariableHoistLegacyPass::runOnFunction(Function &Fn) { + if (skipFunction(Fn)) + return false; + + LLVM_DEBUG(dbgs() << "********** Begin TLS Variable Hoist **********\n"); + LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); + + bool MadeChange = + Impl.runImpl(Fn, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo()); + + if (MadeChange) { + LLVM_DEBUG(dbgs() << "********** Function after TLS Variable Hoist: " + << Fn.getName() << '\n'); + LLVM_DEBUG(dbgs() << Fn); + } + LLVM_DEBUG(dbgs() << "********** End TLS Variable Hoist **********\n"); + + return MadeChange; +} + +/// Record every thread-local GlobalVariable that appears directly as an +/// operand of Inst, together with the operand index it occupies. +void TLSVariableHoistPass::collectTLSCandidate(Instruction *Inst) { + // Skip all cast instructions; this includes the "tls_bitcast" instructions + // this pass inserts, so they are never collected as users themselves. + if (Inst->isCast()) + return; + + // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { + auto *GV = dyn_cast<GlobalVariable>(Inst->getOperand(Idx)); + if (!GV) + continue; + if (!GV->isThreadLocal()) + continue; + + // Add Candidate to TLSCandMap (GV --> Candidate). + if (TLSCandMap.count(GV) == 0) { + tlshoist::TLSCandidate Candidate; + Candidate.GV = GV; + Candidate.addUser(Inst, Idx); + TLSCandMap[GV] = Candidate; + GVs.push_back(GV); + } else { + TLSCandMap[GV].addUser(Inst, Idx); + } + } // end of for all operands +} + +/// Rebuild TLSCandMap/GVs from the direct TLS variable operands found in the +/// blocks of Fn that are reachable from the entry block. +void TLSVariableHoistPass::collectTLSCandidates(Function &Fn) { + // Drop the candidates of the previously processed function up front, so the + // early return below can never leave stale Instruction pointers behind. + TLSCandMap.clear(); + GVs.clear(); + + // First, quickly check if there is TLS Variable. + Module *M = Fn.getParent(); + bool HasTLS = false; + for (GlobalVariable &GV : M->globals()) { + if (GV.isThreadLocal()) { + HasTLS = true; + break; + } + } + + // If none, directly return. + if (!HasTLS) + return; + + // Then, collect TLS Variable info. + for (BasicBlock &BB : Fn) { + // Ignore unreachable basic blocks. + if (!DT->isReachableFromEntry(&BB)) + continue; + + for (Instruction &Inst : BB) + collectTLSCandidate(&Inst); + } +} + +/// Return true if Cand has exactly one user and that user is not inside any +/// loop known to LI (a null LI is treated as "no loops"). +static bool OneUseOutsideLoop(tlshoist::TLSCandidate &Cand, LoopInfo *LI) { + if (Cand.Users.size() != 1) + return false; + + BasicBlock *BB = Cand.Users[0].Inst->getParent(); + if (LI && LI->getLoopFor(BB)) + return false; + + return true; +} + +// Generate a bitcast (no type change) to replace the uses of TLS Candidate. +// TODO: Any better insert position ?
+// Currently inserted in the Entry block (unless sched moves it somewhere better) +static Instruction *genBitCastInst(Function &Fn, GlobalVariable *GV) { + BasicBlock &Entry = Fn.getEntryBlock(); + BasicBlock::iterator Iter = Entry.getFirstInsertionPt(); + Type *Ty = GV->getType(); + auto *CastInst = new BitCastInst(GV, Ty, "tls_bitcast"); + Entry.getInstList().insert(Iter, CastInst); + return CastInst; +} + +/// Redirect all collected users of GV to one entry-block bitcast of GV. +/// Returns true if the IR was changed. +bool TLSVariableHoistPass::tryReplaceTLSCandidate( + Function &Fn, GlobalVariable *GV) { + + tlshoist::TLSCandidate &Cand = TLSCandMap[GV]; + + // If it is used only once and not in a loop, there is nothing to replace. + if (OneUseOutsideLoop(Cand, LI)) + return false; + + // Generate a bitcast (no type change) + auto *CastInst = genBitCastInst(Fn, GV); + + // to replace the uses of TLS Candidate + for (auto &User : Cand.Users) + User.Inst->setOperand(User.OpndIdx, CastInst); + + return true; +} + +/// Try to replace every collected candidate, walking GVs in collection +/// order. Returns true if any candidate was replaced. +bool TLSVariableHoistPass::tryReplaceTLSCandidates(Function &Fn) { + if (TLSCandMap.empty()) + return false; + + bool Replaced = false; + for (auto *GV : GVs) + Replaced = tryReplaceTLSCandidate(Fn, GV) || Replaced; + + return Replaced; +} + +/// Optimize expensive TLS variables in the given function. +bool TLSVariableHoistPass::runImpl(Function &Fn, DominatorTree &DT, + LoopInfo &LI) { + bool MadeChange = false; + Module *M = Fn.getParent(); + if (TLSLoadHoist != "Optimize" && M->getTlsAddrLoadHoist() != "Optimize") + return MadeChange; + + this->LI = &LI; + this->DT = &DT; + // Collect all TLS variable candidates.
+ collectTLSCandidates(Fn); + + MadeChange = tryReplaceTLSCandidates(Fn); + + return MadeChange; +} + +/// New pass manager entry point; preserves only the CFG analyses on change. +PreservedAnalyses TLSVariableHoistPass::run(Function &F, + FunctionAnalysisManager &AM) { + + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + + if (!runImpl(F, DT, LI)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} Index: llvm/test/CodeGen/X86/tls-loads-control.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tls-loads-control.ll @@ -0,0 +1,186 @@ +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=Optimize -o - %s | FileCheck %s +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic -o - %s | FileCheck %s + +; The module flag {"tls-load-hoist", Optimize} is equivalent to the option --tls-load-hoist=Optimize. +; Currently --tls-load-hoist=Non-Optimize / {"tls-load-hoist", Non-Optimize} / no module flag +; all select the default action; for the no-module-flag case refer to tls-loads-control2.ll.
+ +; This test come from compiling clang/test/CodeGen/intel/tls_loads.cpp with: +; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -mtls-load-hoist=Optimize -O2 -S -emit-llvm) + +$_ZTW5thl_x = comdat any + +$_ZTW6thl_x2 = comdat any + +@thl_x = thread_local global i32 0, align 4 +@thl_x2 = thread_local global i32 0, align 4 +@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4 +@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +; CHECK-LABEL: _Z2f1i +; CHECK: entry: +; CHECK-NEXT: %tls_bitcast1 = bitcast i32* @thl_x2 to i32* +; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CHECK-NEXT: %tobool.not3 = icmp eq i32 %c, 0 +; CHECK-NEXT: br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body +; +; CHECK: entry.while.end_crit_edge: ; preds = %entry +; CHECK-NEXT: %.pre = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: br label %while.end +; +; CHECK: while.body: ; preds = %while.body, %entry +; CHECK-NEXT: %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ] +; CHECK-NEXT: %dec = add nsw i32 %c.addr.04, -1 +; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast1, align 4, !tbaa !5 +; CHECK-NEXT: %call = tail call i32 @_Z6gfunc2i(i32 %0) +; CHECK-NEXT: %1 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: %add = add nsw i32 %1, %call +; CHECK-NEXT: store i32 %add, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: %tobool.not = icmp eq i32 %dec, 0 +; CHECK-NEXT: br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !9 +; +; CHECK: while.end: ; preds = %while.body, %entry.while.end_crit_edge +; CHECK-NEXT: %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ] +; CHECK-NEXT: ret i32 %2 +; CHECK-NEXT: } +entry: + %tobool.not3 = icmp eq i32 %c, 0 + br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body + +entry.while.end_crit_edge: 
; preds = %entry + %.pre = load i32, i32* @thl_x, align 4, !tbaa !5 + br label %while.end + +while.body: ; preds = %entry, %while.body + %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.04, -1 + %0 = load i32, i32* @thl_x2, align 4, !tbaa !5 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !5 + %add = add nsw i32 %1, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !9 + +while.end: ; preds = %while.body, %entry.while.end_crit_edge + %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ] + ret i32 %2 +} + +declare i32 @_Z6gfunc2i(i32) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f2i(i32 %c) local_unnamed_addr #0 { +; CHECK-LABEL: _Z2f2i +; CHECK: entry: +; CHECK-NEXT: %tls_bitcast2 = bitcast i32* @_ZZ2f2iE2st.1 to i32* +; CHECK-NEXT: %tls_bitcast1 = bitcast i8* @_ZZ2f2iE2st.0 to i8* +; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CHECK-NEXT: %tobool.not9 = icmp eq i32 %c, 0 +; CHECK-NEXT: br i1 %tobool.not9, label %while.end, label %while.body +; +; CHECK: while.body: ; preds = %while.body, %entry +; CHECK-NEXT: %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ] +; CHECK-NEXT: %dec = add nsw i32 %c.addr.010, -1 +; CHECK-NEXT: %call = tail call i32 @_Z5gfuncv() +; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: %add = add nsw i32 %0, %call +; CHECK-NEXT: store i32 %add, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: %call1 = tail call i32 @_Z5gfuncv() +; CHECK-NEXT: %1 = load i8, i8* %tls_bitcast1, align 4, !tbaa !11 +; CHECK-NEXT: %2 = trunc i32 %call1 to i8 +; CHECK-NEXT: %conv5 = add i8 %1, %2 +; CHECK-NEXT: store i8 %conv5, i8* %tls_bitcast1, align 4, !tbaa !11 +; CHECK-NEXT: %call6 = tail call i32 @_Z5gfuncv() +; CHECK-NEXT: %3 = load i32, i32* %tls_bitcast2, align 
4, !tbaa !13 +; CHECK-NEXT: %add7 = add nsw i32 %3, %call6 +; CHECK-NEXT: store i32 %add7, i32* %tls_bitcast2, align 4, !tbaa !13 +; CHECK-NEXT: %tobool.not = icmp eq i32 %dec, 0 +; CHECK-NEXT: br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !14 +; +; CHECK: while.end: ; preds = %while.body, %entry +; CHECK-NEXT: %4 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: ret i32 %4 +; CHECK-NEXT: } +entry: + %tobool.not9 = icmp eq i32 %c, 0 + br i1 %tobool.not9, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.010, -1 + %call = tail call i32 @_Z5gfuncv() + %0 = load i32, i32* @thl_x, align 4, !tbaa !5 + %add = add nsw i32 %0, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !5 + %call1 = tail call i32 @_Z5gfuncv() + %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !11 + %2 = trunc i32 %call1 to i8 + %conv5 = add i8 %1, %2 + store i8 %conv5, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !11 + %call6 = tail call i32 @_Z5gfuncv() + %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4, !tbaa !13 + %add7 = add nsw i32 %3, %call6 + store i32 %add7, i32* @_ZZ2f2iE2st.1, align 4, !tbaa !13 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !14 + +while.end: ; preds = %while.body, %entry + %4 = load i32, i32* @thl_x, align 4, !tbaa !5 + ret i32 %4 +} + +declare i32 @_Z5gfuncv() local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f3i(i32 %c) local_unnamed_addr #0 { +; CHECK-LABEL: _Z2f3i +; CHECK: entry: +; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: %call = tail call i32 @_Z6gfunc2i(i32 %0) +; CHECK-NEXT: %1 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CHECK-NEXT: %call1 = tail call i32 @_Z6gfunc2i(i32 %1) +; CHECK-NEXT: ret i32 1 +entry: + %0 = load i32, i32* 
@thl_x, align 4, !tbaa !5 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !5 + %call1 = tail call i32 @_Z6gfunc2i(i32 %1) + ret i32 1 +} + +; Function Attrs: uwtable +define weak_odr hidden i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat { + ret i32* @thl_x +} + +; Function Attrs: uwtable +define weak_odr hidden i32* @_ZTW6thl_x2() local_unnamed_addr #2 comdat { + ret i32* @thl_x2 +} + +attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{i32 1, !"tls-load-hoist", !"Optimize"} +!4 = !{!"Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.x.0.YYYYMMDD)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!12, !7, i64 0} +!12 = !{!"_ZTS2SS", !7, i64 0, !6, i64 4} +!13 = !{!12, !6, i64 4} +!14 = distinct !{!14, !10} Index: llvm/test/CodeGen/X86/tls-loads-control2.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tls-loads-control2.ll @@ -0,0 +1,57 @@ +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=Optimize -o - %s | 
FileCheck %s --check-prefix=HOIST0 +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=Non-Optimize -o - %s | FileCheck %s --check-prefix=HOIST2 +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2 + +$_ZTW5thl_x = comdat any + +@thl_x = thread_local global i32 0, align 4 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call = tail call i32 @_Z5gfunci(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call1 = tail call i32 @_Z5gfunci(i32 %1) + ret i32 1 +} + +;HOIST0-LABEL: _Z2f1i +;HOIST0: entry: +;HOIST0-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +;HOIST0-NEXT: %0 = load i32, i32* %tls_bitcast, align 4, !tbaa !4 +;HOIST0-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0) +;HOIST0-NEXT: %1 = load i32, i32* %tls_bitcast, align 4, !tbaa !4 +;HOIST0-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1) +;HOIST0-NEXT: ret i32 1 + +;HOIST2-LABEL: _Z2f1i +;HOIST2: entry: +;HOIST2-NEXT: %0 = load i32, i32* @thl_x, align 4, !tbaa !4 +;HOIST2-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0) +;HOIST2-NEXT: %1 = load i32, i32* @thl_x, align 4, !tbaa !4 +;HOIST2-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1) +;HOIST2-NEXT: ret i32 1 + +declare i32 @_Z5gfunci(i32) local_unnamed_addr #1 + +; Function Attrs: uwtable +define weak_odr hidden i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat { + ret i32* @thl_x +} + +attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } 
+attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{!"Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.x.0.YYYYMMDD)"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} Index: llvm/test/CodeGen/X86/tls-loads-control3.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tls-loads-control3.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=Optimize -o - %s | FileCheck %s --check-prefix=HOIST0 +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=Non-Optimize -o - %s | FileCheck %s --check-prefix=HOIST2 +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2 + +; This test has no module flag {"tls-load-hoist", i32 0}, so use --tls-load-hoist=x +; to choose the way of loading thread_local address. + +; This test come from compiling clang/test/CodeGen/intel/tls_loads.cpp with: +; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm) + +$_ZTW5thl_x = comdat any + +$_ZTW6thl_x2 = comdat any + +@thl_x = thread_local global i32 0, align 4 +@thl_x2 = thread_local global i32 0, align 4 +@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4 +@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4 + +; For HOIST0, check call __tls_get_addr@PLT only one time for each thread_local variable. 
+; For HOIST2, Check the default way: usually call __tls_get_addr@PLT every time when use thread_local variable. + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +; HOIST0-LABEL: _Z2f1i: +; HOIST0: # %bb.0: # %entry +; HOIST0-NEXT: pushq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: pushq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: pushq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 32 +; HOIST0-NEXT: .cfi_offset %rbx, -32 +; HOIST0-NEXT: .cfi_offset %r14, -24 +; HOIST0-NEXT: .cfi_offset %r15, -16 +; HOIST0-NEXT: movl %edi, %ebx +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x2@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %r14 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %r15 +; HOIST0-NEXT: testl %ebx, %ebx +; HOIST0-NEXT: je .LBB0_1 +; HOIST0-NEXT: .p2align 4, 0x90 +; HOIST0-NEXT: .LBB0_2: # %while.body +; HOIST0-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST0-NEXT: movl (%r14), %edi +; HOIST0-NEXT: callq _Z6gfunc2i@PLT +; HOIST0-NEXT: addl (%r15), %eax +; HOIST0-NEXT: movl %eax, (%r15) +; HOIST0-NEXT: decl %ebx +; HOIST0-NEXT: jne .LBB0_2 +; HOIST0-NEXT: jmp .LBB0_3 +; HOIST0-NEXT: .LBB0_1: # %entry.while.end_crit_edge +; HOIST0-NEXT: movl (%r15), %eax +; HOIST0-NEXT: .LBB0_3: # %while.end +; HOIST0-NEXT: popq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: popq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: popq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 8 +; HOIST0-NEXT: retq +; +; HOIST2-LABEL: _Z2f1i: +; HOIST2: # %bb.0: # %entry +; HOIST2-NEXT: pushq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: pushq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 24 +; HOIST2-NEXT: pushq %rax +; 
HOIST2-NEXT: .cfi_def_cfa_offset 32 +; HOIST2-NEXT: .cfi_offset %rbx, -24 +; HOIST2-NEXT: .cfi_offset %rbp, -16 +; HOIST2-NEXT: testl %edi, %edi +; HOIST2-NEXT: je .LBB0_4 +; HOIST2-NEXT: # %bb.1: +; HOIST2-NEXT: movl %edi, %ebx +; HOIST2-NEXT: .p2align 4, 0x90 +; HOIST2-NEXT: .LBB0_2: # %while.body +; HOIST2-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x2@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movl (%rax), %edi +; HOIST2-NEXT: callq _Z6gfunc2i@PLT +; HOIST2-NEXT: movl %eax, %ebp +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: addl (%rax), %ebp +; HOIST2-NEXT: movl %ebp, (%rax) +; HOIST2-NEXT: decl %ebx +; HOIST2-NEXT: jne .LBB0_2 +; HOIST2-NEXT: jmp .LBB0_3 +; HOIST2-NEXT: .LBB0_4: # %entry.while.end_crit_edge +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movl (%rax), %ebp +; HOIST2-NEXT: .LBB0_3: # %while.end +; HOIST2-NEXT: movl %ebp, %eax +; HOIST2-NEXT: addq $8, %rsp +; HOIST2-NEXT: .cfi_def_cfa_offset 24 +; HOIST2-NEXT: popq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: popq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 8 +; HOIST2-NEXT: retq +entry: + %tobool.not3 = icmp eq i32 %c, 0 + br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body + +entry.while.end_crit_edge: ; preds = %entry + %.pre = load i32, i32* @thl_x, align 4, !tbaa !4 + br label %while.end + +while.body: ; preds = %entry, %while.body + %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.04, -1 + %0 = load i32, i32* @thl_x2, align 4, !tbaa !4 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load 
i32, i32* @thl_x, align 4, !tbaa !4 + %add = add nsw i32 %1, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !4 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !8 + +while.end: ; preds = %while.body, %entry.while.end_crit_edge + %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ] + ret i32 %2 +} + +declare i32 @_Z6gfunc2i(i32) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f2i(i32 %c) local_unnamed_addr #0 { +; HOIST0-LABEL: _Z2f2i: +; HOIST0: # %bb.0: # %entry +; HOIST0-NEXT: pushq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: pushq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: pushq %r12 +; HOIST0-NEXT: .cfi_def_cfa_offset 32 +; HOIST0-NEXT: pushq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 40 +; HOIST0-NEXT: pushq %rax +; HOIST0-NEXT: .cfi_def_cfa_offset 48 +; HOIST0-NEXT: .cfi_offset %rbx, -40 +; HOIST0-NEXT: .cfi_offset %r12, -32 +; HOIST0-NEXT: .cfi_offset %r14, -24 +; HOIST0-NEXT: .cfi_offset %r15, -16 +; HOIST0-NEXT: movl %edi, %ebx +; HOIST0-NEXT: leaq _ZZ2f2iE2st.1@TLSLD(%rip), %rdi +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %r15 +; HOIST0-NEXT: movq %rax, %r12 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %r14 +; HOIST0-NEXT: testl %ebx, %ebx +; HOIST0-NEXT: je .LBB1_3 +; HOIST0-NEXT: # %bb.1: # %while.body.preheader +; HOIST0-NEXT: leaq _ZZ2f2iE2st.1@DTPOFF(%r15), %r15 +; HOIST0-NEXT: leaq _ZZ2f2iE2st.0@DTPOFF(%r12), %r12 +; HOIST0-NEXT: .p2align 4, 0x90 +; HOIST0-NEXT: .LBB1_2: # %while.body +; HOIST0-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST0-NEXT: callq _Z5gfuncv@PLT +; HOIST0-NEXT: addl %eax, (%r14) +; HOIST0-NEXT: callq _Z5gfuncv@PLT +; HOIST0-NEXT: addb %al, (%r12) +; HOIST0-NEXT: callq _Z5gfuncv@PLT +; HOIST0-NEXT: addl 
%eax, (%r15) +; HOIST0-NEXT: decl %ebx +; HOIST0-NEXT: jne .LBB1_2 +; HOIST0-NEXT: .LBB1_3: # %while.end +; HOIST0-NEXT: movl (%r14), %eax +; HOIST0-NEXT: addq $8, %rsp +; HOIST0-NEXT: .cfi_def_cfa_offset 40 +; HOIST0-NEXT: popq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 32 +; HOIST0-NEXT: popq %r12 +; HOIST0-NEXT: .cfi_def_cfa_offset 24 +; HOIST0-NEXT: popq %r14 +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: popq %r15 +; HOIST0-NEXT: .cfi_def_cfa_offset 8 +; HOIST0-NEXT: retq +; +; HOIST2-LABEL: _Z2f2i: +; HOIST2: # %bb.0: # %entry +; HOIST2-NEXT: pushq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: pushq %r14 +; HOIST2-NEXT: .cfi_def_cfa_offset 24 +; HOIST2-NEXT: pushq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 32 +; HOIST2-NEXT: .cfi_offset %rbx, -32 +; HOIST2-NEXT: .cfi_offset %r14, -24 +; HOIST2-NEXT: .cfi_offset %rbp, -16 +; HOIST2-NEXT: testl %edi, %edi +; HOIST2-NEXT: je .LBB1_3 +; HOIST2-NEXT: # %bb.1: # %while.body.preheader +; HOIST2-NEXT: movl %edi, %ebx +; HOIST2-NEXT: .p2align 4, 0x90 +; HOIST2-NEXT: .LBB1_2: # %while.body +; HOIST2-NEXT: # =>This Inner Loop Header: Depth=1 +; HOIST2-NEXT: callq _Z5gfuncv@PLT +; HOIST2-NEXT: movl %eax, %ebp +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: addl %ebp, (%rax) +; HOIST2-NEXT: callq _Z5gfuncv@PLT +; HOIST2-NEXT: movl %eax, %ebp +; HOIST2-NEXT: leaq _ZZ2f2iE2st.0@TLSLD(%rip), %rdi +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movq %rax, %r14 +; HOIST2-NEXT: addb %bpl, _ZZ2f2iE2st.0@DTPOFF(%rax) +; HOIST2-NEXT: callq _Z5gfuncv@PLT +; HOIST2-NEXT: movl %eax, %ecx +; HOIST2-NEXT: movq %r14, %rax +; HOIST2-NEXT: addl %ecx, _ZZ2f2iE2st.1@DTPOFF(%r14) +; HOIST2-NEXT: decl %ebx +; HOIST2-NEXT: jne .LBB1_2 +; HOIST2-NEXT: .LBB1_3: # %while.end +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; 
HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movl (%rax), %eax +; HOIST2-NEXT: popq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 24 +; HOIST2-NEXT: popq %r14 +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: popq %rbp +; HOIST2-NEXT: .cfi_def_cfa_offset 8 +; HOIST2-NEXT: retq +entry: + %tobool.not9 = icmp eq i32 %c, 0 + br i1 %tobool.not9, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.010, -1 + %call = tail call i32 @_Z5gfuncv() + %0 = load i32, i32* @thl_x, align 4, !tbaa !4 + %add = add nsw i32 %0, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !4 + %call1 = tail call i32 @_Z5gfuncv() + %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !10 + %2 = trunc i32 %call1 to i8 + %conv5 = add i8 %1, %2 + store i8 %conv5, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !10 + %call6 = tail call i32 @_Z5gfuncv() + %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4, !tbaa !12 + %add7 = add nsw i32 %3, %call6 + store i32 %add7, i32* @_ZZ2f2iE2st.1, align 4, !tbaa !12 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !13 + +while.end: ; preds = %while.body, %entry + %4 = load i32, i32* @thl_x, align 4, !tbaa !4 + ret i32 %4 +} + +declare i32 @_Z5gfuncv() local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f3i(i32 %c) local_unnamed_addr #0 { +; HOIST0-LABEL: _Z2f3i: +; HOIST0: # %bb.0: # %entry +; HOIST0-NEXT: pushq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 16 +; HOIST0-NEXT: .cfi_offset %rbx, -16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST0-NEXT: data16 +; HOIST0-NEXT: data16 +; HOIST0-NEXT: rex64 +; HOIST0-NEXT: callq __tls_get_addr@PLT +; HOIST0-NEXT: movq %rax, %rbx +; HOIST0-NEXT: movl (%rax), %edi +; HOIST0-NEXT: callq _Z6gfunc2i@PLT +; HOIST0-NEXT: movl (%rbx), %edi +; HOIST0-NEXT: callq 
_Z6gfunc2i@PLT +; HOIST0-NEXT: movl $1, %eax +; HOIST0-NEXT: popq %rbx +; HOIST0-NEXT: .cfi_def_cfa_offset 8 +; HOIST0-NEXT: retq +; +; HOIST2-LABEL: _Z2f3i: +; HOIST2: # %bb.0: # %entry +; HOIST2-NEXT: pushq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 16 +; HOIST2-NEXT: .cfi_offset %rbx, -16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; HOIST2-NEXT: data16 +; HOIST2-NEXT: data16 +; HOIST2-NEXT: rex64 +; HOIST2-NEXT: callq __tls_get_addr@PLT +; HOIST2-NEXT: movq %rax, %rbx +; HOIST2-NEXT: movl (%rax), %edi +; HOIST2-NEXT: callq _Z6gfunc2i@PLT +; HOIST2-NEXT: movl (%rbx), %edi +; HOIST2-NEXT: callq _Z6gfunc2i@PLT +; HOIST2-NEXT: movl $1, %eax +; HOIST2-NEXT: popq %rbx +; HOIST2-NEXT: .cfi_def_cfa_offset 8 +; HOIST2-NEXT: retq +entry: + %0 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call1 = tail call i32 @_Z6gfunc2i(i32 %1) + ret i32 1 +} + +attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{!"Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.x.0.YYYYMMDD)"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} +!8 = distinct 
!{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!11, !6, i64 0} +!11 = !{!"_ZTS2SS", !6, i64 0, !5, i64 4} +!12 = !{!11, !5, i64 4} +!13 = distinct !{!13, !9} Index: llvm/tools/llc/llc.cpp =================================================================== --- llvm/tools/llc/llc.cpp +++ llvm/tools/llc/llc.cpp @@ -369,6 +369,7 @@ initializeHardwareLoopsPass(*Registry); initializeTransformUtils(*Registry); initializeReplaceWithVeclibLegacyPass(*Registry); + initializeTLSVariableHoistLegacyPassPass(*Registry); // Initialize debugging passes. initializeScavengerTestPass(*Registry);