Index: clang/lib/CodeGen/CGExpr.cpp =================================================================== --- clang/lib/CodeGen/CGExpr.cpp +++ clang/lib/CodeGen/CGExpr.cpp @@ -2604,6 +2604,15 @@ } llvm::Value *V = CGF.CGM.GetAddrOfGlobalVar(VD); + + if (VD->getTLSKind() != VarDecl::TLS_None && + // We only use @llvm.threadlocal.address if opaque pointers enabled. + // Otherwise, we need to pay for many unnecessary bitcasts. + // + // TODO: Remove this condition once we support opaque pointers only. + CGF.CGM.getCodeGenOpts().OpaquePointers) + V = CGF.Builder.CreateThreadLocalAddress(V); + llvm::Type *RealVarTy = CGF.getTypes().ConvertTypeForMem(VD->getType()); V = EmitBitCastOfLValueToProperType(CGF, V, RealVarTy); CharUnits Alignment = CGF.getContext().getDeclAlign(VD); @@ -2873,6 +2882,12 @@ llvm_unreachable("DeclRefExpr for Decl not entered in LocalDeclMap?"); } + // Handle threadlocal function locals. + if (VD->getTLSKind() != VarDecl::TLS_None && + CGM.getCodeGenOpts().OpaquePointers) { + auto *var = Builder.CreateThreadLocalAddress(addr.getPointer()); + addr = Address(var, addr.getElementType(), addr.getAlignment()); + } // Check for OpenMP threadprivate variables. if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd && Index: clang/lib/CodeGen/ItaniumCXXABI.cpp =================================================================== --- clang/lib/CodeGen/ItaniumCXXABI.cpp +++ clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -2990,6 +2990,11 @@ if (Val->getType() != Wrapper->getReturnType()) Val = Builder.CreatePointerBitCastOrAddrSpaceCast( Val, Wrapper->getReturnType(), ""); + + // TODO: Remove this condition once we support opaque pointers only. 
+ if (CGM.getCodeGenOpts().OpaquePointers) + Val = Builder.CreateThreadLocalAddress(Val); + Builder.CreateRet(Val); } } Index: clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp =================================================================== --- clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp +++ clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp @@ -17,7 +17,8 @@ // CHECK-LABEL: define weak_odr hidden {{.*}} @_ZTWN3TLSI1SE5mDataE() {{.*}} comdat { // CHECK: call void @_ZTHN3TLSI1SE5mDataE() -// CHECK: ret {{.*}} @_ZN3TLSI1SE5mDataE +// CHECK: [[TLSmData_ADDR:%[^ ]+]] = call ptr @llvm.threadlocal.address(ptr @_ZN3TLSI1SE5mDataE) +// CHECK: ret {{.*}} [[TLSmData_ADDR]] // Unlike for a global, the global initialization function must not be in a // COMDAT with the variable, because it is referenced from the _ZTH function Index: clang/test/CodeGenCXX/pr18635.cpp =================================================================== --- clang/test/CodeGenCXX/pr18635.cpp +++ clang/test/CodeGenCXX/pr18635.cpp @@ -4,7 +4,9 @@ // CHECK: [[X_GLOBAL:@[^ ]+]]{{.*}}thread_local global // returned somewhere in TLS wrapper: -// CHECK: ret{{.*}}[[X_GLOBAL]] +// CHECK: define {{.+}} ptr @_ZTW1x( +// CHECK: [[X_GLOBAL_ADDR:%[^ ]+]] = call ptr @llvm.threadlocal.address(ptr [[X_GLOBAL]]) +// CHECK: ret{{.*}}[[X_GLOBAL_ADDR]] template class unique_ptr { template struct pair { Index: clang/test/CodeGenCXX/threadlocal_address.cpp =================================================================== --- /dev/null +++ clang/test/CodeGenCXX/threadlocal_address.cpp @@ -0,0 +1,54 @@ +// Test that the use of thread local variables would be wrapped by @llvm.threadlocal.address intrinsics. 
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -triple %itanium_abi_triple -o - %s -disable-llvm-passes | FileCheck %s +// RUN: %clang_cc1 -std=c++11 -emit-llvm -triple %itanium_abi_triple -o - -O1 %s | FileCheck %s -check-prefix=CHECK-O1 +thread_local int i; +int g() { + i++; + return i; +} +// CHECK: @i = thread_local global i32 0 +// CHECK: @_ZZ1fvE1j = internal thread_local global i32 0 +// +// CHECK: @_Z1gv() +// CHECK-NEXT: entry +// CHECK-NEXT: %[[IA:.+]] = call ptr @llvm.threadlocal.address(ptr @i) +// CHECK-NEXT: %[[VA:.+]] = load i32, ptr %[[IA]] +// CHECK-NEXT: %[[INC:.+]] = add nsw i32 %[[VA]], 1 +// CHECK-NEXT: store i32 %[[INC]], ptr %[[IA]], align 4 +// CHECK-NEXT: %[[IA2:.+]] = call ptr @llvm.threadlocal.address(ptr @i) +// CHECK-NEXT: %[[RET:.+]] = load i32, ptr %[[IA2]], align 4 +// CHECK-NEXT: ret i32 %[[RET]] +// +// CHECK: declare ptr @llvm.threadlocal.address(ptr) #[[ATTR_NUM:.+]] +// +// CHECK-O1-LABEL: @_Z1gv +// CHECK-O1-NEXT: entry: +// CHECK-O1-NEXT: %[[I_ADDR:.+]] = call ptr @llvm.threadlocal.address(ptr nonnull @i) +// CHECK-O1-NEXT: %[[VAL:.+]] = load i32, ptr %[[I_ADDR]] +// CHECK-O1-NEXT: %[[INC:.+]] = add nsw i32 %[[VAL]], 1 +// CHECK-O1-NEXT: store i32 %[[INC]], ptr %[[I_ADDR]] +// CHECK-O1-NEXT: ret i32 %[[INC]] +int f() { + thread_local int j = 0; + j++; + return j; +} +// CHECK: @_Z1fv() +// CHECK-NEXT: entry +// CHECK-NEXT: %[[JA:.+]] = call ptr @llvm.threadlocal.address(ptr @_ZZ1fvE1j) +// CHECK-NEXT: %[[VA:.+]] = load i32, ptr %[[JA]] +// CHECK-NEXT: %[[INC:.+]] = add nsw i32 %[[VA]], 1 +// CHECK-NEXT: store i32 %[[INC]], ptr %[[JA]], align 4 +// CHECK-NEXT: %[[JA2:.+]] = call ptr @llvm.threadlocal.address(ptr @_ZZ1fvE1j) +// CHECK-NEXT: %[[RET:.+]] = load i32, ptr %[[JA2]], align 4 +// CHECK-NEXT: ret i32 %[[RET]] +// +// CHECK-O1-LABEL: @_Z1fv +// CHECK-O1-NEXT: entry: +// CHECK-O1-NEXT: %[[J_ADDR:.+]] = call ptr @llvm.threadlocal.address(ptr nonnull @_ZZ1fvE1j) +// CHECK-O1-NEXT: %[[VAL:.+]] = load i32, ptr %[[J_ADDR]] +// 
CHECK-O1-NEXT: %[[INC:.+]] = add nsw i32 %[[VAL]], 1 +// CHECK-O1-NEXT: store i32 %[[INC]], ptr %[[J_ADDR]] +// CHECK-O1-NEXT: ret i32 %[[INC]] +// +// CHECK: attributes #[[ATTR_NUM]] = { nocallback nofree nosync nounwind readnone speculatable willreturn } Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -24402,6 +24402,37 @@ mask argument does not match the pointer size of the target, the mask is zero-extended or truncated accordingly. +.. _int_threadlocal_address: + +'``llvm.threadlocal.address``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare ptr @llvm.threadlocal.address(ptr) nounwind readnone willreturn + +Arguments: +"""""""""" + +The first argument is a pointer, which refers to a thread local variable. + +Overview: +"""""""""" + +LLVM has traditionally treated the address of a thread local variable as a +constant expression, which is not accurate. The ``llvm.threadlocal.address`` +intrinsic represents the address of the thread local variable. + +Semantics: +"""""""""" + +The address of a thread local variable is not a constant, since it depends on +the calling thread. The ``llvm.threadlocal.address`` intrinsic returns the +address of the given thread local variable in the calling thread. + .. _int_vscale: '``llvm.vscale``' Intrinsic Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -642,6 +642,7 @@ case Intrinsic::coro_align: case Intrinsic::coro_suspend: case Intrinsic::coro_subfn_addr: + case Intrinsic::threadlocal_address: // These intrinsics don't actually represent code after lowering. 
return 0; } Index: llvm/include/llvm/IR/IRBuilder.h =================================================================== --- llvm/include/llvm/IR/IRBuilder.h +++ llvm/include/llvm/IR/IRBuilder.h @@ -743,6 +743,9 @@ /// If the pointer isn't i8* it will be converted. CallInst *CreateInvariantStart(Value *Ptr, ConstantInt *Size = nullptr); + /// Create a call to llvm.threadlocal.address intrinsic. + CallInst *CreateThreadLocalAddress(Value *Ptr); + /// Create a call to Masked Load intrinsic CallInst *CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru = nullptr, const Twine &Name = ""); Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -1390,6 +1390,10 @@ def int_ptrmask: DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>, llvm_anyint_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +// Intrinsic to wrap a thread local variable. +def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_ptr_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; + def int_experimental_stepvector : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [], [IntrNoMem]>; Index: llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp =================================================================== --- llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This pass implements IR lowering for the llvm.load.relative and llvm.objc.* -// intrinsics. +// This pass implements IR lowering for the llvm.threadlocal.address, +// llvm.load.relative and llvm.objc.* intrinsics. 
// //===----------------------------------------------------------------------===// @@ -128,6 +128,19 @@ return true; } +static bool lowerThreadLocalIntrinsics(Function &F) { + if (F.use_empty()) + return false; + + for (Use &U : llvm::make_early_inc_range(F.uses())) { + auto *CB = cast<CallBase>(U.getUser()); + CB->replaceAllUsesWith(CB->getOperand(0)); + CB->eraseFromParent(); + } + + return true; +} + static bool lowerIntrinsics(Module &M) { bool Changed = false; for (Function &F : M) { @@ -213,6 +226,9 @@ case Intrinsic::objc_sync_exit: Changed |= lowerObjCCall(F, "objc_sync_exit"); break; + case Intrinsic::threadlocal_address: + Changed |= lowerThreadLocalIntrinsics(F); + break; } } return Changed; Index: llvm/lib/IR/IRBuilder.cpp =================================================================== --- llvm/lib/IR/IRBuilder.cpp +++ llvm/lib/IR/IRBuilder.cpp @@ -499,6 +499,13 @@ return createCallHelper(TheFn, Ops, this); } +CallInst *IRBuilderBase::CreateThreadLocalAddress(Value *Ptr) { + assert(isa<GlobalValue>(Ptr) && cast<GlobalValue>(Ptr)->isThreadLocal() && + "threadlocal_address only applies to thread local variables."); + return CreateIntrinsic(llvm::Intrinsic::threadlocal_address, llvm::None, + {Ptr}); +} + CallInst * IRBuilderBase::CreateAssumption(Value *Cond, ArrayRef<OperandBundleDef> OpBundles) { Index: llvm/test/Transforms/PreISelIntrinsicLowering/threadlocal_address.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/PreISelIntrinsicLowering/threadlocal_address.ll @@ -0,0 +1,25 @@ +; RUN: opt -pre-isel-intrinsic-lowering -opaque-pointers -S -o - < %s | FileCheck %s + +@i = thread_local global i32 0, align 4 + +define dso_local noundef i32 @foo() { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @i, align 4 +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; CHECK-NEXT: store i32 [[INC]], ptr @i, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @i, align 4 +; CHECK-NEXT: ret i32 [[TMP1]] +; +; 
CHECK-NOT: call{{.*}}@llvm.threadlocal.address( +entry: + %0 = call ptr @llvm.threadlocal.address(ptr @i) + %1 = load i32, ptr %0, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, ptr %0, align 4 + %2 = call ptr @llvm.threadlocal.address(ptr @i) + %3 = load i32, ptr %2, align 4 + ret i32 %3 +} + +declare ptr @llvm.threadlocal.address(ptr) nounwind readnone willreturn