diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2614,7 +2614,12 @@
           *VD, CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false)),
                      getContext().getDeclAlign(VD));
 
-    // No other cases for now.
+      // Remember thread-local variables declared in a different function so
+      // that FinishFunction can emit their initializers in this function too.
+      const Decl *DC = cast<Decl>(VD->getDeclContext());
+      if (DC != CurGD.getDecl() && VD->getTLSKind() == VarDecl::TLS_Dynamic)
+        ForeignStaticTLSVars.insert(VD);
+      // No other cases for now.
     } else {
       llvm_unreachable("DeclRefExpr for Decl not entered in LocalDeclMap?");
     }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -467,6 +467,10 @@
   /// should emit cleanups.
   bool CurFuncIsThunk = false;
 
+  /// Static thread-local variables referenced by this function that were
+  /// declared in a different (parent) function.
+  llvm::SmallSet<const VarDecl *, 8> ForeignStaticTLSVars;
+
   /// In ARC, whether we should autorelease the return value.
   bool AutoreleaseResult = false;
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -31,6 +31,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Intrinsics.h"
@@ -295,6 +296,40 @@
   delete BB;
 }
 
+/// Collect every variable that the initializer of \p VD refers to, either
+/// directly or through the initializers of the variables it names. \p VD
+/// itself is never part of the result.
+static llvm::SmallSet<const VarDecl *, 8>
+enumerateVarInitDependencies(const VarDecl *VD) {
+  llvm::SmallSet<const VarDecl *, 8> Deps;
+  llvm::SmallVector<const Stmt *, 16> Worklist;
+
+  if (const Expr *Init = VD->getInit())
+    Worklist.push_back(Init);
+
+  while (!Worklist.empty()) {
+    const Stmt *S = Worklist.pop_back_val();
+    // children() may yield null entries for absent sub-statements.
+    if (!S)
+      continue;
+
+    if (const auto *DRE = dyn_cast<DeclRefExpr>(S)) {
+      const auto *Ref = dyn_cast<VarDecl>(DRE->getDecl());
+      // Only follow a variable the first time it is seen; this also keeps
+      // self-referential initializers (e.g. `void *p = &p;`) from looping.
+      if (Ref && Ref != VD && Deps.insert(Ref).second)
+        if (const Expr *RefInit = Ref->getInit())
+          Worklist.push_back(RefInit);
+      continue;
+    }
+
+    for (const Stmt *Child : S->children())
+      Worklist.push_back(Child);
+  }
+
+  return Deps;
+}
+
 void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
   assert(BreakContinueStack.empty() &&
          "mismatched push/pop in break/continue stack!");
@@ -384,6 +419,107 @@
     CGBuilderTy(*this, AllocaInsertPt).CreateCall(FrameEscapeFn, EscapeArgs);
   }
 
+  // Emit initializers for static local variables that we referenced that are
+  // declared in another function, which may be uninitialized on entry if this
+  // function may execute on a separate thread. For example, when we're
+  // emitting the lambda in the following code:
+  //
+  //   class Object {
+  //     int init;
+  //     Object() : init(1) {}
+  //   };
+  //
+  //   main() {
+  //     static thread_local Object var;
+  //     std::thread([] {
+  //       ...emit initializer for var here...
+  //     });
+  //   }
+  //
+  // or another example:
+  //
+  //   main() {
+  //     static Object var;
+  //     #pragma omp threadprivate(var)
+  //     #pragma omp parallel
+  //     {
+  //       ...emit initializer for var here...
+  //     }
+  //   }
+  //
+  // CUDA's local and local static __shared__ variables should not have any
+  // non-empty initializers. This is ensured by Sema. Whatever initializer
+  // such a variable may have when it gets here is a no-op and should not be
+  // emitted, so such variables are skipped below.
+  auto IsCudaSharedVar = [this](const VarDecl *VD) {
+    return getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
+           VD->hasAttr<CUDASharedAttr>();
+  };
+
+  // Every variable considered for initialization here, paired with the
+  // number of variables its own initializer transitively refers to.
+  llvm::SmallVector<std::pair<const VarDecl *, unsigned>, 8> OrderedVarInits;
+  llvm::SmallSet<const VarDecl *, 8> UniqueVarsToInit;
+  auto AddVarToInit = [&](const VarDecl *VD, unsigned NumDeps) {
+    if (UniqueVarsToInit.insert(VD).second)
+      OrderedVarInits.emplace_back(VD, NumDeps);
+  };
+
+  for (const VarDecl *VD : ForeignStaticTLSVars) {
+    if (IsCudaSharedVar(VD))
+      continue;
+
+    const auto Deps = enumerateVarInitDependencies(VD);
+    AddVarToInit(VD, Deps.size());
+    for (const VarDecl *Dep : Deps)
+      if (!UniqueVarsToInit.count(Dep))
+        AddVarToInit(Dep, enumerateVarInitDependencies(Dep).size());
+  }
+
+  // Initializers are emitted below in reverse execution order (each one is
+  // inserted ahead of those emitted before it), so a variable must come
+  // before everything it depends on. The dependency sets are transitively
+  // closed, so in the absence of cycles a variable always has strictly more
+  // dependencies than any variable it depends on. Ordering by that count,
+  // with the source location as a tie-breaker, is therefore a topologically
+  // valid order, and unlike a comparator that queries the dependency sets
+  // directly it is a proper strict weak ordering.
+  llvm::sort(OrderedVarInits, [](const auto &A, const auto &B) {
+    if (A.second != B.second)
+      return A.second > B.second;
+    return B.first->getLocation() < A.first->getLocation();
+  });
+
+  for (const auto &VarAndNumDeps : OrderedVarInits) {
+    const VarDecl *VD = VarAndNumDeps.first;
+    if (!VD->getInit() || IsCudaSharedVar(VD))
+      continue;
+
+    // The global is expected to exist already because the parent function
+    // should have been emitted first. Skip the variable if no global
+    // variable is registered for it rather than dereferencing null.
+    auto *GV = dyn_cast_or_null<llvm::GlobalVariable>(
+        CGM.getStaticLocalDeclAddress(VD));
+    if (!GV)
+      continue;
+
+    auto IP = Builder.saveAndClearIP();
+    llvm::BasicBlock *BBParent = AllocaInsertPt->getParent();
+    llvm::Instruction *INext = AllocaInsertPt->getNextNonDebugInstruction();
+    llvm::BasicBlock *BBNext =
+        BBParent->splitBasicBlock(INext, BBParent->getName() + ".next");
+
+    INext = AllocaInsertPt->getNextNonDebugInstruction();
+
+    Builder.SetInsertPoint(BBParent);
+    AddInitializerToStaticVarDecl(*VD, GV);
+    if (INext != BBParent->getTerminator()) {
+      INext->eraseFromParent();
+      Builder.CreateBr(BBNext);
+    }
+    Builder.restoreIP(IP);
+  }
+
   // Remove the AllocaInsertPt instruction, which is just a convenience for us.
   llvm::Instruction *Ptr = AllocaInsertPt;
   AllocaInsertPt = nullptr;
diff --git a/clang/test/CodeGenCXX/cxx11-thread-local.cpp b/clang/test/CodeGenCXX/cxx11-thread-local.cpp
--- a/clang/test/CodeGenCXX/cxx11-thread-local.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local.cpp
@@ -268,6 +268,33 @@
   return this->n;
 }
 
+namespace static_tls_in_lambda {
+  struct X {
+    X() {}
+  };
+
+
+  X (*f())() {
+    static thread_local X x;
+
+    return [] { return x; };
+  }
+
+  auto y = f();
+
+  void g() { y(); }
+
+  void bar(X**, X**, X**);
+  void baz(void());
+  void f2() {
+    thread_local X x;
+    thread_local X* p = &x;
+    thread_local X* q = p;
+    thread_local X* r = q;
+    baz([]{bar(&p, &q, &r);});
+  }
+}
+
 namespace {
 thread_local int anon_i{1};
 }
@@ -303,6 +330,42 @@
 // CHECK: store i64 1, i64* @_ZGVN1XIiE1mE
 // CHECK: br label
 
+// CHECK: define internal void @"_ZZN20static_tls_in_lambda1fEvENK3$_1clEv"
+// CHECK: %[[static_tls_guard_val:.*]] = load i8, i8* @_ZGVZN20static_tls_in_lambda1fEvE1x
+// CHECK: %[[static_tls_guard_init:.*]] = icmp eq i8 %[[static_tls_guard_val]], 0
+// CHECK: br i1 %[[static_tls_guard_init]],
+// CHECK: call void @_ZN20static_tls_in_lambda1XC1Ev(%"struct.static_tls_in_lambda::X"* @_ZZN20static_tls_in_lambda1fEvE1x)
+// CHECK: store i8 1, i8* @_ZGVZN20static_tls_in_lambda1fEvE1x, align 1
+
+// CHECK: define internal void @"_ZZN20static_tls_in_lambda2f2EvENK3$_2clEv"
+// init x
+// CHECK: %[[static_tls_guard_val:.*]] = load i8, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1x
+// CHECK: %[[static_tls_guard_init:.*]] = icmp eq i8 %[[static_tls_guard_val]], 0
+// CHECK: br i1 %[[static_tls_guard_init]],
+// CHECK: call void @_ZN20static_tls_in_lambda1XC1Ev(%"struct.static_tls_in_lambda::X"* @_ZZN20static_tls_in_lambda2f2EvE1x)
+// CHECK: store i8 1, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1x, align 1
+// init p
+// CHECK: %[[static_tls_guard_val:.*]] = load i8, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1p
+// CHECK: %[[static_tls_guard_init:.*]] = icmp eq i8 %[[static_tls_guard_val]], 0
+// CHECK: br i1 %[[static_tls_guard_init]],
+// CHECK: store %"struct.static_tls_in_lambda::X"* @_ZZN20static_tls_in_lambda2f2EvE1x, %"struct.static_tls_in_lambda::X"** @_ZZN20static_tls_in_lambda2f2EvE1p
+// CHECK: store i8 1, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1p, align 1
+// init q
+// CHECK: %[[static_tls_guard_val:.*]] = load i8, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1q
+// CHECK: %[[static_tls_guard_init:.*]] = icmp eq i8 %[[static_tls_guard_val]], 0
+// CHECK: br i1 %[[static_tls_guard_init]],
+// CHECK: %[[static_tls_var_prev:.*]] = load %"struct.static_tls_in_lambda::X"*, %"struct.static_tls_in_lambda::X"** @_ZZN20static_tls_in_lambda2f2EvE1p, align 8
+// CHECK: store %"struct.static_tls_in_lambda::X"* %[[static_tls_var_prev]], %"struct.static_tls_in_lambda::X"** @_ZZN20static_tls_in_lambda2f2EvE1q, align 8
+// CHECK: store i8 1, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1q, align 1
+// init r
+// CHECK: %[[static_tls_guard_val:.*]] = load i8, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1r
+// CHECK: %[[static_tls_guard_init:.*]] = icmp eq i8 %[[static_tls_guard_val]], 0
+// CHECK: br i1 %[[static_tls_guard_init]],
+// CHECK: %[[static_tls_var_prev:.*]] = load %"struct.static_tls_in_lambda::X"*, %"struct.static_tls_in_lambda::X"** @_ZZN20static_tls_in_lambda2f2EvE1q, align 8
+// CHECK: store %"struct.static_tls_in_lambda::X"* %[[static_tls_var_prev]], %"struct.static_tls_in_lambda::X"** @_ZZN20static_tls_in_lambda2f2EvE1r, align 8
+// CHECK: store i8 1, i8* @_ZGVZN20static_tls_in_lambda2f2EvE1r, align 1
+
+
 // CHECK: define {{.*}}@[[GLOBAL_INIT:.*]]()
 // CHECK: call void @[[C_INIT]]()
 // CHECK: call void @[[E_INIT]]()