diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3241,8 +3241,10 @@ FunctionCallee HardwareTidFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_hardware_thread_id_in_block); - Value *Tid = + CallInst *Tid = OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {}); + if (Function *Fn = dyn_cast(HardwareTidFn.getCallee())) + Tid->setCallingConv(Fn->getCallingConv()); Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid); OMPInfoCache.OMPBuilder.Builder .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB) @@ -3255,8 +3257,11 @@ M, OMPRTL___kmpc_barrier_simple_spmd); OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy( RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt())); - OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}) - ->setDebugLoc(DL); + CallInst *Barrier = + OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}); + Barrier->setDebugLoc(DL); + if (Function *Fn = dyn_cast(BarrierFn.getCallee())) + Barrier->setCallingConv(Fn->getCallingConv()); // Second barrier ensures workers have read broadcast values. if (HasBroadcastValues) diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -1430,7 +1430,7 @@ ; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*) to i32* ; AMDGPU-NEXT: br label [[REGION_CHECK_TID:%.*]] ; AMDGPU: region.check.tid: -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; AMDGPU-NEXT: [[TMP0:%.*]] = call fastcc i32 @__kmpc_get_hardware_thread_id_in_block() ; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; AMDGPU: region.guarded: @@ -1466,7 +1466,7 @@ ; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32* ; NVPTX-NEXT: br label [[REGION_CHECK_TID:%.*]] ; NVPTX: region.check.tid: -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; NVPTX-NEXT: [[TMP0:%.*]] = call fastcc i32 @__kmpc_get_hardware_thread_id_in_block() ; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; NVPTX: region.guarded: @@ -2328,6 +2328,8 @@ ret void } +declare fastcc i32 @__kmpc_get_hardware_thread_id_in_block(); + attributes #0 = { alwaysinline convergent norecurse nounwind } attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn } attributes #2 = { convergent }