Index: llvm/include/llvm/CodeGen/MachinePassRegistry.def =================================================================== --- llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -47,6 +47,7 @@ FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ()) FUNCTION_PASS("lowerinvoke", LowerInvokePass, ()) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ()) +FUNCTION_PASS("tlscontrol", TLSVariableControlPass, ()) FUNCTION_PASS("verify", VerifierPass, ()) #undef FUNCTION_PASS Index: llvm/include/llvm/IR/Module.h =================================================================== --- llvm/include/llvm/IR/Module.h +++ llvm/include/llvm/IR/Module.h @@ -910,6 +910,10 @@ int getStackProtectorGuardOffset() const; void setStackProtectorGuardOffset(int Offset); + /// Get/set the model of TLS address loading. + StringRef getTlsAddrLoadControl() const; + void setTlsAddrLoadControl(StringRef Model); + /// Get/set the stack alignment overridden from the default. 
unsigned getOverrideStackAlignment() const; void setOverrideStackAlignment(unsigned Align); Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -445,6 +445,7 @@ void initializeTargetPassConfigPass(PassRegistry&); void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); void initializeThreadSanitizerLegacyPassPass(PassRegistry&); +void initializeTLSVariableControlLegacyPassPass(PassRegistry &); void initializeTwoAddressInstructionPassPass(PassRegistry&); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeTypePromotionPass(PassRegistry&); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -177,6 +177,7 @@ (void) llvm::createStripDeadDebugInfoPass(); (void) llvm::createStripDeadPrototypesPass(); (void) llvm::createTailCallEliminationPass(); + (void)llvm::createTLSVariableControlPass(); (void) llvm::createJumpThreadingPass(); (void) llvm::createDFAJumpThreadingPass(); (void) llvm::createUnifyFunctionExitNodesPass(); Index: llvm/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/include/llvm/Transforms/Scalar.h +++ llvm/include/llvm/Transforms/Scalar.h @@ -426,6 +426,12 @@ // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); +//===----------------------------------------------------------------------===// +// +// TLSVariableControl - This pass prepares a function for expensive TLSVariable. 
+// +FunctionPass *createTLSVariableControlPass(); + //===----------------------------------------------------------------------===// // // LowerConstantIntrinsicss - Expand any remaining llvm.objectsize and Index: llvm/include/llvm/Transforms/Scalar/TLSVariableControl.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Scalar/TLSVariableControl.h @@ -0,0 +1,137 @@ +//==- TLSVariableControl.h - Remove Redundant TLS Loads --*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminates Redundant TLS Loads if related option is set. +// For example: +// static __thread int x; +// int g(); +// int f(int c) { +// int *px = &x; +// while (c--) +// *px += g(); +// return *px; +// } +// +// will generate Redundant TLS Loads when compiled with +// Clang++ -fPIC -ftls-model=global-dynamic -O2 -S +// +// .LBB0_2: # %while.body +// # =>This Inner Loop Header: Depth=1 +// callq _Z1gv@PLT +// movl %eax, %ebp +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// addl _ZL1x@DTPOFF(%rax), %ebp +// movl %ebp, _ZL1x@DTPOFF(%rax) +// addl $-1, %ebx +// jne .LBB0_2 +// jmp .LBB0_3 +// .LBB0_4: # %entry.while.end_crit_edge +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// movl _ZL1x@DTPOFF(%rax), %ebp +// +// The Redundant TLS Loads will hurt the performance, especially in loops. +// So we try to eliminate/move them if requested by the user, producing: +// +// # %bb.0: # %entry +// ... 
+//         movl    %edi, %ebx
+//         leaq    _ZL1x@TLSLD(%rip), %rdi
+//         callq   __tls_get_addr@PLT
+//         leaq    _ZL1x@DTPOFF(%rax), %r14
+//         testl   %ebx, %ebx
+//         je      .LBB0_1
+// .LBB0_2:                                # %while.body
+//                                         # =>This Inner Loop Header: Depth=1
+//         callq   _Z1gv@PLT
+//         addl    (%r14), %eax
+//         movl    %eax, (%r14)
+//         addl    $-1, %ebx
+//         jne     .LBB0_2
+//         jmp     .LBB0_3
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_TLSVARIABLECONTROL_H
+#define LLVM_TRANSFORMS_SCALAR_TLSVARIABLECONTROL_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+#include <algorithm>
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class Function;
+class GlobalVariable;
+class Instruction;
+
+/// A private "module" namespace for types and utilities used by
+/// TLSVariableControl. These are implementation details and should
+/// not be used by clients.
+namespace tlscontrol {
+
+/// Keeps track of the user of a TLS variable and the operand index
+/// where the variable is used.
+struct TLSUser {
+  Instruction *Inst;
+  unsigned OpndIdx;
+
+  TLSUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) {}
+};
+
+using TLSUseListType = SmallVector<TLSUser, 8>;
+
+/// Keeps track of a TLS variable candidate and its uses.
+struct TLSCandidate {
+  TLSUseListType Uses;
+  GlobalVariable *GV;
+
+  TLSCandidate(GlobalVariable *GV = nullptr) : GV(GV) {}
+
+  /// Record that operand \p Idx of \p Inst uses this TLS variable.
+  void addUser(Instruction *Inst, unsigned Idx) {
+    Uses.push_back(TLSUser(Inst, Idx));
+  }
+};
+
+} // end namespace tlscontrol
+
+class TLSVariableControlPass : public PassInfoMixin<TLSVariableControlPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  // Glue for old PM.
+  bool runImpl(Function &F, DominatorTree &DT, LoopInfo &LI);
+
+  void collectTLSCandidates(Function &Fn);
+  void collectTLSCandidate(Instruction *Inst);
+  bool tryReplaceTLSCandidates(Function &Fn);
+  bool tryReplaceTLSCandidate(Function &Fn, tlscontrol::TLSCandidate &Cand);
+
+private:
+  DominatorTree *DT = nullptr;
+  LoopInfo *LI = nullptr;
+
+  /// Keeps track of TLS variable candidates found in the function.
+  using TLSCandMapType = std::map<GlobalVariable *, tlscontrol::TLSCandidate>;
+  TLSCandMapType TLSCandMap;
+
+  /// TLS global variables in the order they were first seen in the function.
+  SmallVector<GlobalVariable *> GVs;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_TLSVARIABLECONTROL_H
Index: llvm/lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- llvm/lib/CodeGen/TargetPassConfig.cpp
+++ llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -922,6 +922,9 @@
   // Allow disabling it for testing purposes.
   if (!DisableExpandReductions)
     addPass(createExpandReductionsPass());
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createTLSVariableControlPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
Index: llvm/lib/IR/Module.cpp
===================================================================
--- llvm/lib/IR/Module.cpp
+++ llvm/lib/IR/Module.cpp
@@ -726,6 +726,18 @@
   addModuleFlag(ModFlagBehavior::Error, "stack-protector-guard-offset", Offset);
 }
 
+StringRef Module::getTlsAddrLoadControl() const {
+  Metadata *MD = getModuleFlag("tls-load-control");
+  if (auto *MDS = dyn_cast_or_null<MDString>(MD))
+    return MDS->getString();
+  return {};
+}
+
+void Module::setTlsAddrLoadControl(StringRef Model) {
+  MDString *ID = MDString::get(getContext(), Model);
+  addModuleFlag(ModFlagBehavior::Error, "tls-load-control", ID);
+}
+
 unsigned Module::getOverrideStackAlignment() const {
   Metadata *MD = getModuleFlag("override-stack-alignment");
   if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD))
Index: llvm/lib/Passes/PassBuilder.cpp
=================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -213,6 +213,7 @@ #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" #include "llvm/Transforms/Scalar/StructurizeCFG.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" +#include "llvm/Transforms/Scalar/TLSVariableControl.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -361,6 +361,7 @@ FUNCTION_PASS("verify", ScalarEvolutionVerifierPass()) FUNCTION_PASS("view-cfg", CFGViewerPass()) FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass()) +FUNCTION_PASS("tlscontrol", TLSVariableControlPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) FUNCTION_PASS("memprof", MemProfilerPass()) Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -77,6 +77,7 @@ StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp + TLSVariableControl.cpp WarnMissedTransforms.cpp ADDITIONAL_HEADER_DIRS Index: llvm/lib/Transforms/Scalar/Scalar.cpp =================================================================== --- llvm/lib/Transforms/Scalar/Scalar.cpp +++ llvm/lib/Transforms/Scalar/Scalar.cpp @@ -104,6 +104,7 @@ initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); initializeTailCallElimPass(Registry); + initializeTLSVariableControlLegacyPassPass(Registry); initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry); 
initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReduceLegacyPassPass(Registry); Index: llvm/lib/Transforms/Scalar/TLSVariableControl.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/Scalar/TLSVariableControl.cpp @@ -0,0 +1,297 @@ +//===- TLSVariableControl.cpp - Remove Redundant TLS Loads --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminates Redundant TLS Loads if related option is set. +// For example: +// static __thread int x; +// int g(); +// int f(int c) { +// int *px = &x; +// while (c--) +// *px += g(); +// return *px; +// } +// +// will generate Redundant TLS Loads when compiled with +// Clang++ -fPIC -ftls-model=global-dynamic -O2 -S +// +// .LBB0_2: # %while.body +// # =>This Inner Loop Header: Depth=1 +// callq _Z1gv@PLT +// movl %eax, %ebp +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// addl _ZL1x@DTPOFF(%rax), %ebp +// movl %ebp, _ZL1x@DTPOFF(%rax) +// addl $-1, %ebx +// jne .LBB0_2 +// jmp .LBB0_3 +// .LBB0_4: # %entry.while.end_crit_edge +// leaq _ZL1x@TLSLD(%rip), %rdi +// callq __tls_get_addr@PLT +// movl _ZL1x@DTPOFF(%rax), %ebp +// +// The Redundant TLS Loads will hurt the performance, especially in loops. +// So we try to eliminate/move them if requested by the user, producing: +// +// # %bb.0: # %entry +// ... 
+//         movl    %edi, %ebx
+//         leaq    _ZL1x@TLSLD(%rip), %rdi
+//         callq   __tls_get_addr@PLT
+//         leaq    _ZL1x@DTPOFF(%rax), %r14
+//         testl   %ebx, %ebx
+//         je      .LBB0_1
+// .LBB0_2:                                # %while.body
+//                                         # =>This Inner Loop Header: Depth=1
+//         callq   _Z1gv@PLT
+//         addl    (%r14), %eax
+//         movl    %eax, (%r14)
+//         addl    $-1, %ebx
+//         jne     .LBB0_2
+//         jmp     .LBB0_3
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/TLSVariableControl.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+using namespace tlscontrol;
+
+#define DEBUG_TYPE "tlscontrol"
+
+static cl::opt<std::string> TLSLoadControl(
+    "tls-load-control",
+    cl::desc("Control the TLS loads in PIC model: "
+             "tls-load-control=Optimize: Eliminate redundant TLS load(s). "
+             "tls-load-control=Strict: Strictly load TLS address before every use. "
+             "tls-load-control=Non-Optimize: Generally load TLS before use(s)."),
+    cl::init("Non-Optimize"), cl::Hidden);
+
+namespace {
+
+/// The TLS Variable Control pass.
+class TLSVariableControlLegacyPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  TLSVariableControlLegacyPass() : FunctionPass(ID) {
+    initializeTLSVariableControlLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  StringRef getPassName() const override { return "TLS Variable Control"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+
+private:
+  TLSVariableControlPass Impl;
+};
+
+} // end anonymous namespace
+
+char TLSVariableControlLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(TLSVariableControlLegacyPass, "tlscontrol",
+                      "TLS Variable Control", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(TLSVariableControlLegacyPass, "tlscontrol",
+                    "TLS Variable Control", false, false)
+
+FunctionPass *llvm::createTLSVariableControlPass() {
+  return new TLSVariableControlLegacyPass();
+}
+
+/// Legacy PM entry point: forwards to the shared implementation in Impl.
+bool TLSVariableControlLegacyPass::runOnFunction(Function &Fn) {
+  if (skipFunction(Fn))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "********** Begin TLS Variable Control **********\n");
+  LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+  bool MadeChange =
+      Impl.runImpl(Fn, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+                   getAnalysis<LoopInfoWrapperPass>().getLoopInfo());
+
+  if (MadeChange) {
+    LLVM_DEBUG(dbgs() << "********** Function after TLS Variable Control: "
+                      << Fn.getName() << '\n');
+    LLVM_DEBUG(dbgs() << Fn);
+  }
+  LLVM_DEBUG(dbgs() << "********** End TLS Variable Control **********\n");
+
+  return MadeChange;
+}
+
+void TLSVariableControlPass::collectTLSCandidate(Instruction *Inst) {
+  // Skip casts; this also skips the tls_bitcast created by this pass.
+  if (Inst->isCast())
+    return;
+
+  // Scan all operands.
+  for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+    auto *GV = dyn_cast<GlobalVariable>(Inst->getOperand(Idx));
+    if (!GV)
+      continue;
+    if (!GV->isThreadLocal())
+      continue;
+
+    // Add Candidate to TLSCandMap (GV --> Candidate). GVs records each TLS
+    // variable once, in first-seen order.
+    auto It = TLSCandMap.find(GV);
+    if (It == TLSCandMap.end()) {
+      It = TLSCandMap.insert({GV, TLSCandidate(GV)}).first;
+      GVs.push_back(GV);
+    }
+
+    It->second.addUser(Inst, Idx);
+  } // end of for all operands
+}
+
+void TLSVariableControlPass::collectTLSCandidates(Function &Fn) {
+  // Reset per-function state first so an early return leaves nothing stale.
+  TLSCandMap.clear();
+  GVs.clear();
+
+  // Quickly check whether the module has any TLS variable; if not, return.
+  Module *M = Fn.getParent();
+  bool HasTLS = false;
+  for (GlobalVariable &GV : M->globals()) {
+    if (GV.isThreadLocal()) {
+      HasTLS = true;
+      break;
+    }
+  }
+
+  if (!HasTLS)
+    return;
+
+  // Then, collect TLS Variable info.
+  for (BasicBlock &BB : Fn) {
+    // Ignore unreachable basic blocks.
+    if (!DT->isReachableFromEntry(&BB))
+      continue;
+
+    for (Instruction &Inst : BB)
+      collectTLSCandidate(&Inst);
+  }
+}
+
+static bool OneUseOutsideLoop(tlscontrol::TLSCandidate &Cand, LoopInfo *LI) {
+  if (Cand.Uses.size() != 1)
+    return false;
+
+  BasicBlock *BB = Cand.Uses[0].Inst->getParent();
+  if (LI && LI->getLoopFor(BB))
+    return false;
+
+  return true;
+}
+
+// Generate a bitcast (no type change) to replace the uses of TLS Candidate.
+// TODO: Any better insert position ?
+// Currently inserted at the first insertion point of the entry block.
+static Instruction *genBitCastInst(Function &Fn,
+                                   tlscontrol::TLSCandidate &Cand) {
+  BasicBlock &Entry = Fn.getEntryBlock();
+  BasicBlock::iterator Iter = Entry.getFirstInsertionPt();
+  Type *Ty = Cand.GV->getType();
+  auto *CastInst = new BitCastInst(Cand.GV, Ty, "tls_bitcast");
+  Entry.getInstList().insert(Iter, CastInst);
+  return CastInst;
+}
+
+bool TLSVariableControlPass::tryReplaceTLSCandidate(
+    Function &Fn, tlscontrol::TLSCandidate &Cand) {
+  // A single use outside of any loop gains nothing from being replaced.
+  if (OneUseOutsideLoop(Cand, LI))
+    return false;
+
+  // Generate a bitcast of the TLS variable (no type change) ...
+  auto *CastInst = genBitCastInst(Fn, Cand);
+
+  // ... and make every recorded use refer to it instead of the variable.
+  for (auto &Use : Cand.Uses)
+    Use.Inst->setOperand(Use.OpndIdx, CastInst);
+
+  return true;
+}
+
+bool TLSVariableControlPass::tryReplaceTLSCandidates(Function &Fn) {
+  if (TLSCandMap.empty())
+    return false;
+
+  bool Replaced = false;
+  for (auto *GV : GVs) {
+    Replaced = tryReplaceTLSCandidate(Fn, TLSCandMap[GV]) || Replaced;
+  }
+
+  return Replaced;
+}
+
+/// Run only if the option or the module flag "tls-load-control" is "Optimize".
+bool TLSVariableControlPass::runImpl(Function &Fn, DominatorTree &DT,
+                                     LoopInfo &LI) {
+  bool MadeChange = false;
+  Module *M = Fn.getParent();
+  if (TLSLoadControl != "Optimize" && M->getTlsAddrLoadControl() != "Optimize")
+    return MadeChange;
+
+  this->LI = &LI;
+  this->DT = &DT;
+  // Collect all TLS variable candidates.
+  collectTLSCandidates(Fn);
+
+  MadeChange = tryReplaceTLSCandidates(Fn);
+
+  return MadeChange;
+}
+
+PreservedAnalyses TLSVariableControlPass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  // runImpl needs loop info and the dominator tree of F.
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+  if (!runImpl(F, DT, LI))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
Index: llvm/test/CodeGen/X86/tls-loads-control.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/tls-loads-control.ll
@@ -0,0 +1,186 @@
+; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlscontrol --relocation-model=pic --tls-load-control=Optimize -o - %s | FileCheck %s --check-prefixes=CONTROL0
+; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlscontrol --relocation-model=pic -o - %s | FileCheck %s --check-prefixes=CONTROL0
+
+; The module flag {"tls-load-control", Optimize} is equivalent to the option --tls-load-control=Optimize.
+; Currently --tls-load-control=Non-Optimize / {"tls-load-control", Non-Optimize } / no module flag
+; all select the default action; for the no-module-flag case see tls-loads-control2.ll.
+ +; This test come from compiling clang/test/CodeGen/intel/tls_loads.cpp with: +; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -mtls-load-control=Optimize -O2 -S -emit-llvm) + +$_ZTW5thl_x = comdat any + +$_ZTW6thl_x2 = comdat any + +@thl_x = thread_local global i32 0, align 4 +@thl_x2 = thread_local global i32 0, align 4 +@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4 +@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +; CONTROL0-LABEL: _Z2f1i +; CONTROL0: entry: +; CONTROL0-NEXT: %tls_bitcast1 = bitcast i32* @thl_x2 to i32* +; CONTROL0-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CONTROL0-NEXT: %tobool.not3 = icmp eq i32 %c, 0 +; CONTROL0-NEXT: br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body +; +; CONTROL0: entry.while.end_crit_edge: ; preds = %entry +; CONTROL0-NEXT: %.pre = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: br label %while.end +; +; CONTROL0: while.body: ; preds = %while.body, %entry +; CONTROL0-NEXT: %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ] +; CONTROL0-NEXT: %dec = add nsw i32 %c.addr.04, -1 +; CONTROL0-NEXT: %0 = load i32, i32* %tls_bitcast1, align 4, !tbaa !5 +; CONTROL0-NEXT: %call = tail call i32 @_Z6gfunc2i(i32 %0) +; CONTROL0-NEXT: %1 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: %add = add nsw i32 %1, %call +; CONTROL0-NEXT: store i32 %add, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: %tobool.not = icmp eq i32 %dec, 0 +; CONTROL0-NEXT: br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !9 +; +; CONTROL0: while.end: ; preds = %while.body, %entry.while.end_crit_edge +; CONTROL0-NEXT: %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ] +; CONTROL0-NEXT: ret i32 %2 +; CONTROL0-NEXT: } +entry: + %tobool.not3 = icmp eq i32 %c, 0 + br i1 %tobool.not3, label 
%entry.while.end_crit_edge, label %while.body + +entry.while.end_crit_edge: ; preds = %entry + %.pre = load i32, i32* @thl_x, align 4, !tbaa !5 + br label %while.end + +while.body: ; preds = %entry, %while.body + %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.04, -1 + %0 = load i32, i32* @thl_x2, align 4, !tbaa !5 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !5 + %add = add nsw i32 %1, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !9 + +while.end: ; preds = %while.body, %entry.while.end_crit_edge + %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ] + ret i32 %2 +} + +declare i32 @_Z6gfunc2i(i32) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f2i(i32 %c) local_unnamed_addr #0 { +; CONTROL0-LABEL: _Z2f2i +; CONTROL0: entry: +; CONTROL0-NEXT: %tls_bitcast2 = bitcast i32* @_ZZ2f2iE2st.1 to i32* +; CONTROL0-NEXT: %tls_bitcast1 = bitcast i8* @_ZZ2f2iE2st.0 to i8* +; CONTROL0-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CONTROL0-NEXT: %tobool.not9 = icmp eq i32 %c, 0 +; CONTROL0-NEXT: br i1 %tobool.not9, label %while.end, label %while.body +; +; CONTROL0: while.body: ; preds = %while.body, %entry +; CONTROL0-NEXT: %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ] +; CONTROL0-NEXT: %dec = add nsw i32 %c.addr.010, -1 +; CONTROL0-NEXT: %call = tail call i32 @_Z5gfuncv() +; CONTROL0-NEXT: %0 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: %add = add nsw i32 %0, %call +; CONTROL0-NEXT: store i32 %add, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: %call1 = tail call i32 @_Z5gfuncv() +; CONTROL0-NEXT: %1 = load i8, i8* %tls_bitcast1, align 4, !tbaa !11 +; CONTROL0-NEXT: %2 = trunc i32 %call1 to i8 +; CONTROL0-NEXT: %conv5 = add i8 %1, %2 +; CONTROL0-NEXT: store i8 %conv5, i8* 
%tls_bitcast1, align 4, !tbaa !11 +; CONTROL0-NEXT: %call6 = tail call i32 @_Z5gfuncv() +; CONTROL0-NEXT: %3 = load i32, i32* %tls_bitcast2, align 4, !tbaa !13 +; CONTROL0-NEXT: %add7 = add nsw i32 %3, %call6 +; CONTROL0-NEXT: store i32 %add7, i32* %tls_bitcast2, align 4, !tbaa !13 +; CONTROL0-NEXT: %tobool.not = icmp eq i32 %dec, 0 +; CONTROL0-NEXT: br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !14 +; +; CONTROL0: while.end: ; preds = %while.body, %entry +; CONTROL0-NEXT: %4 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: ret i32 %4 +; CONTROL0-NEXT: } +entry: + %tobool.not9 = icmp eq i32 %c, 0 + br i1 %tobool.not9, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.010, -1 + %call = tail call i32 @_Z5gfuncv() + %0 = load i32, i32* @thl_x, align 4, !tbaa !5 + %add = add nsw i32 %0, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !5 + %call1 = tail call i32 @_Z5gfuncv() + %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !11 + %2 = trunc i32 %call1 to i8 + %conv5 = add i8 %1, %2 + store i8 %conv5, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !11 + %call6 = tail call i32 @_Z5gfuncv() + %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4, !tbaa !13 + %add7 = add nsw i32 %3, %call6 + store i32 %add7, i32* @_ZZ2f2iE2st.1, align 4, !tbaa !13 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !14 + +while.end: ; preds = %while.body, %entry + %4 = load i32, i32* @thl_x, align 4, !tbaa !5 + ret i32 %4 +} + +declare i32 @_Z5gfuncv() local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f3i(i32 %c) local_unnamed_addr #0 { +; CONTROL0-LABEL: _Z2f3i +; CONTROL0: entry: +; CONTROL0-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +; CONTROL0-NEXT: %0 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: %call = tail call i32 @_Z6gfunc2i(i32 
%0) +; CONTROL0-NEXT: %1 = load i32, i32* %tls_bitcast, align 4, !tbaa !5 +; CONTROL0-NEXT: %call1 = tail call i32 @_Z6gfunc2i(i32 %1) +; CONTROL0-NEXT: ret i32 1 +entry: + %0 = load i32, i32* @thl_x, align 4, !tbaa !5 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !5 + %call1 = tail call i32 @_Z6gfunc2i(i32 %1) + ret i32 1 +} + +; Function Attrs: uwtable +define weak_odr hidden i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat { + ret i32* @thl_x +} + +; Function Attrs: uwtable +define weak_odr hidden i32* @_ZTW6thl_x2() local_unnamed_addr #2 comdat { + ret i32* @thl_x2 +} + +attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{i32 1, !"tls-load-control", !"Optimize"} +!4 = !{!"Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.x.0.YYYYMMDD)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!12, !7, i64 0} +!12 = !{!"_ZTS2SS", !7, i64 0, !6, i64 4} +!13 = !{!12, !6, i64 4} +!14 = distinct !{!14, !10} Index: llvm/test/CodeGen/X86/tls-loads-control2.ll =================================================================== --- 
/dev/null +++ llvm/test/CodeGen/X86/tls-loads-control2.ll @@ -0,0 +1,57 @@ +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlscontrol --relocation-model=pic --tls-load-control=Optimize -o - %s | FileCheck %s --check-prefixes=CONTROL0 +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlscontrol --relocation-model=pic --tls-load-control=Non-Optimize -o - %s | FileCheck %s --check-prefixes=CONTROL2 +; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlscontrol --relocation-model=pic -o - %s | FileCheck %s --check-prefixes=CONTROL2 + +$_ZTW5thl_x = comdat any + +@thl_x = thread_local global i32 0, align 4 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call = tail call i32 @_Z5gfunci(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call1 = tail call i32 @_Z5gfunci(i32 %1) + ret i32 1 +} + +;CONTROL0-LABEL: _Z2f1i +;CONTROL0: entry: +;CONTROL0-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32* +;CONTROL0-NEXT: %0 = load i32, i32* %tls_bitcast, align 4, !tbaa !4 +;CONTROL0-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0) +;CONTROL0-NEXT: %1 = load i32, i32* %tls_bitcast, align 4, !tbaa !4 +;CONTROL0-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1) +;CONTROL0-NEXT: ret i32 1 + +;CONTROL2-LABEL: _Z2f1i +;CONTROL2: entry: +;CONTROL2-NEXT: %0 = load i32, i32* @thl_x, align 4, !tbaa !4 +;CONTROL2-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0) +;CONTROL2-NEXT: %1 = load i32, i32* @thl_x, align 4, !tbaa !4 +;CONTROL2-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1) +;CONTROL2-NEXT: ret i32 1 + +declare i32 @_Z5gfunci(i32) local_unnamed_addr #1 + +; Function Attrs: uwtable +define weak_odr hidden i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat { + ret i32* @thl_x +} + +attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{!"Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.x.0.YYYYMMDD)"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} Index: llvm/test/CodeGen/X86/tls-loads-control3.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tls-loads-control3.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-control=Optimize -o - %s | FileCheck %s --check-prefixes=CONTROL0 +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-control=Non-Optimize -o - %s | FileCheck %s --check-prefixes=CONTROL2 +; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic -o - %s | FileCheck %s --check-prefixes=CONTROL2 + +; This test has no module flag {"tls-load-control", i32 0}, so use --tls-load-control=x +; to choose the way of loading thread_local address. 
+ +; This test come from compiling clang/test/CodeGen/intel/tls_loads.cpp with: +; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm) + +$_ZTW5thl_x = comdat any + +$_ZTW6thl_x2 = comdat any + +@thl_x = thread_local global i32 0, align 4 +@thl_x2 = thread_local global i32 0, align 4 +@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4 +@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4 + +; For CONTROL0, check call __tls_get_addr@PLT only one time for each thread_local variable. +; For CONTROL2, Check the default way: usually call __tls_get_addr@PLT every time when use thread_local variable. + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 { +; CONTROL0-LABEL: _Z2f1i: +; CONTROL0: # %bb.0: # %entry +; CONTROL0-NEXT: pushq %r15 +; CONTROL0-NEXT: .cfi_def_cfa_offset 16 +; CONTROL0-NEXT: pushq %r14 +; CONTROL0-NEXT: .cfi_def_cfa_offset 24 +; CONTROL0-NEXT: pushq %rbx +; CONTROL0-NEXT: .cfi_def_cfa_offset 32 +; CONTROL0-NEXT: .cfi_offset %rbx, -32 +; CONTROL0-NEXT: .cfi_offset %r14, -24 +; CONTROL0-NEXT: .cfi_offset %r15, -16 +; CONTROL0-NEXT: movl %edi, %ebx +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: leaq thl_x2@TLSGD(%rip), %rdi +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: rex64 +; CONTROL0-NEXT: callq __tls_get_addr@PLT +; CONTROL0-NEXT: movq %rax, %r14 +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: rex64 +; CONTROL0-NEXT: callq __tls_get_addr@PLT +; CONTROL0-NEXT: movq %rax, %r15 +; CONTROL0-NEXT: testl %ebx, %ebx +; CONTROL0-NEXT: je .LBB0_1 +; CONTROL0-NEXT: .p2align 4, 0x90 +; CONTROL0-NEXT: .LBB0_2: # %while.body +; CONTROL0-NEXT: # =>This Inner Loop Header: Depth=1 +; CONTROL0-NEXT: movl (%r14), %edi +; CONTROL0-NEXT: callq _Z6gfunc2i@PLT +; CONTROL0-NEXT: addl (%r15), %eax +; CONTROL0-NEXT: movl %eax, (%r15) +; CONTROL0-NEXT: decl %ebx +; 
CONTROL0-NEXT: jne .LBB0_2 +; CONTROL0-NEXT: jmp .LBB0_3 +; CONTROL0-NEXT: .LBB0_1: # %entry.while.end_crit_edge +; CONTROL0-NEXT: movl (%r15), %eax +; CONTROL0-NEXT: .LBB0_3: # %while.end +; CONTROL0-NEXT: popq %rbx +; CONTROL0-NEXT: .cfi_def_cfa_offset 24 +; CONTROL0-NEXT: popq %r14 +; CONTROL0-NEXT: .cfi_def_cfa_offset 16 +; CONTROL0-NEXT: popq %r15 +; CONTROL0-NEXT: .cfi_def_cfa_offset 8 +; CONTROL0-NEXT: retq +; +; CONTROL2-LABEL: _Z2f1i: +; CONTROL2: # %bb.0: # %entry +; CONTROL2-NEXT: pushq %rbp +; CONTROL2-NEXT: .cfi_def_cfa_offset 16 +; CONTROL2-NEXT: pushq %rbx +; CONTROL2-NEXT: .cfi_def_cfa_offset 24 +; CONTROL2-NEXT: pushq %rax +; CONTROL2-NEXT: .cfi_def_cfa_offset 32 +; CONTROL2-NEXT: .cfi_offset %rbx, -24 +; CONTROL2-NEXT: .cfi_offset %rbp, -16 +; CONTROL2-NEXT: testl %edi, %edi +; CONTROL2-NEXT: je .LBB0_4 +; CONTROL2-NEXT: # %bb.1: +; CONTROL2-NEXT: movl %edi, %ebx +; CONTROL2-NEXT: .p2align 4, 0x90 +; CONTROL2-NEXT: .LBB0_2: # %while.body +; CONTROL2-NEXT: # =>This Inner Loop Header: Depth=1 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: leaq thl_x2@TLSGD(%rip), %rdi +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: rex64 +; CONTROL2-NEXT: callq __tls_get_addr@PLT +; CONTROL2-NEXT: movl (%rax), %edi +; CONTROL2-NEXT: callq _Z6gfunc2i@PLT +; CONTROL2-NEXT: movl %eax, %ebp +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: rex64 +; CONTROL2-NEXT: callq __tls_get_addr@PLT +; CONTROL2-NEXT: addl (%rax), %ebp +; CONTROL2-NEXT: movl %ebp, (%rax) +; CONTROL2-NEXT: decl %ebx +; CONTROL2-NEXT: jne .LBB0_2 +; CONTROL2-NEXT: jmp .LBB0_3 +; CONTROL2-NEXT: .LBB0_4: # %entry.while.end_crit_edge +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: rex64 +; CONTROL2-NEXT: callq __tls_get_addr@PLT +; CONTROL2-NEXT: movl (%rax), %ebp +; CONTROL2-NEXT: .LBB0_3: # %while.end +; 
CONTROL2-NEXT: movl %ebp, %eax +; CONTROL2-NEXT: addq $8, %rsp +; CONTROL2-NEXT: .cfi_def_cfa_offset 24 +; CONTROL2-NEXT: popq %rbx +; CONTROL2-NEXT: .cfi_def_cfa_offset 16 +; CONTROL2-NEXT: popq %rbp +; CONTROL2-NEXT: .cfi_def_cfa_offset 8 +; CONTROL2-NEXT: retq +entry: + %tobool.not3 = icmp eq i32 %c, 0 + br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body + +entry.while.end_crit_edge: ; preds = %entry + %.pre = load i32, i32* @thl_x, align 4, !tbaa !4 + br label %while.end + +while.body: ; preds = %entry, %while.body + %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.04, -1 + %0 = load i32, i32* @thl_x2, align 4, !tbaa !4 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !4 + %add = add nsw i32 %1, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !4 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !8 + +while.end: ; preds = %while.body, %entry.while.end_crit_edge + %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ] + ret i32 %2 +} + +declare i32 @_Z6gfunc2i(i32) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f2i(i32 %c) local_unnamed_addr #0 { +; CONTROL0-LABEL: _Z2f2i: +; CONTROL0: # %bb.0: # %entry +; CONTROL0-NEXT: pushq %r15 +; CONTROL0-NEXT: .cfi_def_cfa_offset 16 +; CONTROL0-NEXT: pushq %r14 +; CONTROL0-NEXT: .cfi_def_cfa_offset 24 +; CONTROL0-NEXT: pushq %r12 +; CONTROL0-NEXT: .cfi_def_cfa_offset 32 +; CONTROL0-NEXT: pushq %rbx +; CONTROL0-NEXT: .cfi_def_cfa_offset 40 +; CONTROL0-NEXT: pushq %rax +; CONTROL0-NEXT: .cfi_def_cfa_offset 48 +; CONTROL0-NEXT: .cfi_offset %rbx, -40 +; CONTROL0-NEXT: .cfi_offset %r12, -32 +; CONTROL0-NEXT: .cfi_offset %r14, -24 +; CONTROL0-NEXT: .cfi_offset %r15, -16 +; CONTROL0-NEXT: movl %edi, %ebx +; CONTROL0-NEXT: leaq _ZZ2f2iE2st.1@TLSLD(%rip), %rdi +; CONTROL0-NEXT: callq __tls_get_addr@PLT +; CONTROL0-NEXT: movq 
%rax, %r15 +; CONTROL0-NEXT: movq %rax, %r12 +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: rex64 +; CONTROL0-NEXT: callq __tls_get_addr@PLT +; CONTROL0-NEXT: movq %rax, %r14 +; CONTROL0-NEXT: testl %ebx, %ebx +; CONTROL0-NEXT: je .LBB1_3 +; CONTROL0-NEXT: # %bb.1: # %while.body.preheader +; CONTROL0-NEXT: leaq _ZZ2f2iE2st.1@DTPOFF(%r15), %r15 +; CONTROL0-NEXT: leaq _ZZ2f2iE2st.0@DTPOFF(%r12), %r12 +; CONTROL0-NEXT: .p2align 4, 0x90 +; CONTROL0-NEXT: .LBB1_2: # %while.body +; CONTROL0-NEXT: # =>This Inner Loop Header: Depth=1 +; CONTROL0-NEXT: callq _Z5gfuncv@PLT +; CONTROL0-NEXT: addl %eax, (%r14) +; CONTROL0-NEXT: callq _Z5gfuncv@PLT +; CONTROL0-NEXT: addb %al, (%r12) +; CONTROL0-NEXT: callq _Z5gfuncv@PLT +; CONTROL0-NEXT: addl %eax, (%r15) +; CONTROL0-NEXT: decl %ebx +; CONTROL0-NEXT: jne .LBB1_2 +; CONTROL0-NEXT: .LBB1_3: # %while.end +; CONTROL0-NEXT: movl (%r14), %eax +; CONTROL0-NEXT: addq $8, %rsp +; CONTROL0-NEXT: .cfi_def_cfa_offset 40 +; CONTROL0-NEXT: popq %rbx +; CONTROL0-NEXT: .cfi_def_cfa_offset 32 +; CONTROL0-NEXT: popq %r12 +; CONTROL0-NEXT: .cfi_def_cfa_offset 24 +; CONTROL0-NEXT: popq %r14 +; CONTROL0-NEXT: .cfi_def_cfa_offset 16 +; CONTROL0-NEXT: popq %r15 +; CONTROL0-NEXT: .cfi_def_cfa_offset 8 +; CONTROL0-NEXT: retq +; +; CONTROL2-LABEL: _Z2f2i: +; CONTROL2: # %bb.0: # %entry +; CONTROL2-NEXT: pushq %rbp +; CONTROL2-NEXT: .cfi_def_cfa_offset 16 +; CONTROL2-NEXT: pushq %r14 +; CONTROL2-NEXT: .cfi_def_cfa_offset 24 +; CONTROL2-NEXT: pushq %rbx +; CONTROL2-NEXT: .cfi_def_cfa_offset 32 +; CONTROL2-NEXT: .cfi_offset %rbx, -32 +; CONTROL2-NEXT: .cfi_offset %r14, -24 +; CONTROL2-NEXT: .cfi_offset %rbp, -16 +; CONTROL2-NEXT: testl %edi, %edi +; CONTROL2-NEXT: je .LBB1_3 +; CONTROL2-NEXT: # %bb.1: # %while.body.preheader +; CONTROL2-NEXT: movl %edi, %ebx +; CONTROL2-NEXT: .p2align 4, 0x90 +; CONTROL2-NEXT: .LBB1_2: # %while.body +; CONTROL2-NEXT: # =>This Inner 
Loop Header: Depth=1 +; CONTROL2-NEXT: callq _Z5gfuncv@PLT +; CONTROL2-NEXT: movl %eax, %ebp +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: rex64 +; CONTROL2-NEXT: callq __tls_get_addr@PLT +; CONTROL2-NEXT: addl %ebp, (%rax) +; CONTROL2-NEXT: callq _Z5gfuncv@PLT +; CONTROL2-NEXT: movl %eax, %ebp +; CONTROL2-NEXT: leaq _ZZ2f2iE2st.0@TLSLD(%rip), %rdi +; CONTROL2-NEXT: callq __tls_get_addr@PLT +; CONTROL2-NEXT: movq %rax, %r14 +; CONTROL2-NEXT: addb %bpl, _ZZ2f2iE2st.0@DTPOFF(%rax) +; CONTROL2-NEXT: callq _Z5gfuncv@PLT +; CONTROL2-NEXT: movl %eax, %ecx +; CONTROL2-NEXT: movq %r14, %rax +; CONTROL2-NEXT: addl %ecx, _ZZ2f2iE2st.1@DTPOFF(%r14) +; CONTROL2-NEXT: decl %ebx +; CONTROL2-NEXT: jne .LBB1_2 +; CONTROL2-NEXT: .LBB1_3: # %while.end +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: rex64 +; CONTROL2-NEXT: callq __tls_get_addr@PLT +; CONTROL2-NEXT: movl (%rax), %eax +; CONTROL2-NEXT: popq %rbx +; CONTROL2-NEXT: .cfi_def_cfa_offset 24 +; CONTROL2-NEXT: popq %r14 +; CONTROL2-NEXT: .cfi_def_cfa_offset 16 +; CONTROL2-NEXT: popq %rbp +; CONTROL2-NEXT: .cfi_def_cfa_offset 8 +; CONTROL2-NEXT: retq +entry: + %tobool.not9 = icmp eq i32 %c, 0 + br i1 %tobool.not9, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ] + %dec = add nsw i32 %c.addr.010, -1 + %call = tail call i32 @_Z5gfuncv() + %0 = load i32, i32* @thl_x, align 4, !tbaa !4 + %add = add nsw i32 %0, %call + store i32 %add, i32* @thl_x, align 4, !tbaa !4 + %call1 = tail call i32 @_Z5gfuncv() + %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !10 + %2 = trunc i32 %call1 to i8 + %conv5 = add i8 %1, %2 + store i8 %conv5, i8* @_ZZ2f2iE2st.0, align 4, !tbaa !10 + %call6 = tail call i32 @_Z5gfuncv() + %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4, !tbaa 
!12 + %add7 = add nsw i32 %3, %call6 + store i32 %add7, i32* @_ZZ2f2iE2st.1, align 4, !tbaa !12 + %tobool.not = icmp eq i32 %dec, 0 + br i1 %tobool.not, label %while.end, label %while.body, !llvm.loop !13 + +while.end: ; preds = %while.body, %entry + %4 = load i32, i32* @thl_x, align 4, !tbaa !4 + ret i32 %4 +} + +declare i32 @_Z5gfuncv() local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define i32 @_Z2f3i(i32 %c) local_unnamed_addr #0 { +; CONTROL0-LABEL: _Z2f3i: +; CONTROL0: # %bb.0: # %entry +; CONTROL0-NEXT: pushq %rbx +; CONTROL0-NEXT: .cfi_def_cfa_offset 16 +; CONTROL0-NEXT: .cfi_offset %rbx, -16 +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: data16 +; CONTROL0-NEXT: rex64 +; CONTROL0-NEXT: callq __tls_get_addr@PLT +; CONTROL0-NEXT: movq %rax, %rbx +; CONTROL0-NEXT: movl (%rax), %edi +; CONTROL0-NEXT: callq _Z6gfunc2i@PLT +; CONTROL0-NEXT: movl (%rbx), %edi +; CONTROL0-NEXT: callq _Z6gfunc2i@PLT +; CONTROL0-NEXT: movl $1, %eax +; CONTROL0-NEXT: popq %rbx +; CONTROL0-NEXT: .cfi_def_cfa_offset 8 +; CONTROL0-NEXT: retq +; +; CONTROL2-LABEL: _Z2f3i: +; CONTROL2: # %bb.0: # %entry +; CONTROL2-NEXT: pushq %rbx +; CONTROL2-NEXT: .cfi_def_cfa_offset 16 +; CONTROL2-NEXT: .cfi_offset %rbx, -16 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: leaq thl_x@TLSGD(%rip), %rdi +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: data16 +; CONTROL2-NEXT: rex64 +; CONTROL2-NEXT: callq __tls_get_addr@PLT +; CONTROL2-NEXT: movq %rax, %rbx +; CONTROL2-NEXT: movl (%rax), %edi +; CONTROL2-NEXT: callq _Z6gfunc2i@PLT +; CONTROL2-NEXT: movl (%rbx), %edi +; CONTROL2-NEXT: callq _Z6gfunc2i@PLT +; CONTROL2-NEXT: movl $1, %eax +; CONTROL2-NEXT: popq %rbx +; CONTROL2-NEXT: .cfi_def_cfa_offset 8 +; CONTROL2-NEXT: retq +entry: + %0 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call = tail call i32 @_Z6gfunc2i(i32 %0) + %1 = load i32, i32* @thl_x, align 4, !tbaa !4 + %call1 = tail call i32 @_Z6gfunc2i(i32 %1) + ret i32 1 +} + 
+attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{!"Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.x.0.YYYYMMDD)"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!11, !6, i64 0} +!11 = !{!"_ZTS2SS", !6, i64 0, !5, i64 4} +!12 = !{!11, !5, i64 4} +!13 = distinct !{!13, !9} Index: llvm/tools/llc/llc.cpp =================================================================== --- llvm/tools/llc/llc.cpp +++ llvm/tools/llc/llc.cpp @@ -369,6 +369,7 @@ initializeHardwareLoopsPass(*Registry); initializeTransformUtils(*Registry); initializeReplaceWithVeclibLegacyPass(*Registry); + initializeTLSVariableControlLegacyPassPass(*Registry); // Initialize debugging passes. initializeScavengerTestPass(*Registry);