diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1015,11 +1015,13 @@ /// variables used in \a OutlinedFn function. /// \param IfCond Condition in the associated 'if' clause, if it was /// specified, nullptr otherwise. + /// \param NumThreads The value corresponding to the num_threads clause, if + /// any, or nullptr. /// virtual void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef CapturedVars, - const Expr *IfCond); + const Expr *IfCond, llvm::Value *NumThreads); /// Emits a critical region. /// \param CriticalName Name of the critical region. @@ -1991,11 +1993,13 @@ /// variables used in \a OutlinedFn function. /// \param IfCond Condition in the associated 'if' clause, if it was /// specified, nullptr otherwise. + /// \param NumThreads The value corresponding to the num_threads clause, if + /// any, or nullptr. /// void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef CapturedVars, - const Expr *IfCond) override; + const Expr *IfCond, llvm::Value *NumThreads) override; /// Emits a critical region. /// \param CriticalName Name of the critical region. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -2085,7 +2085,8 @@ void CGOpenMPRuntime::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef CapturedVars, - const Expr *IfCond) { + const Expr *IfCond, + llvm::Value *NumThreads) { if (!CGF.HaveInsertPoint()) return; llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); @@ -12890,7 +12891,8 @@ SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef CapturedVars, - const Expr *IfCond) { + const Expr *IfCond, + llvm::Value *NumThreads) { llvm_unreachable("Not supported in SIMD-only mode"); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -257,10 +257,13 @@ /// variables used in \a OutlinedFn function. /// \param IfCond Condition in the associated 'if' clause, if it was /// specified, nullptr otherwise. + /// \param NumThreads The value corresponding to the num_threads clause, if + /// any, + /// or nullptr. void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef CapturedVars, - const Expr *IfCond) override; + const Expr *IfCond, llvm::Value *NumThreads) override; /// Emit an implicit/explicit barrier for OpenMP threads. /// \param Kind Directive for which this implicit barrier call must be diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1221,11 +1221,7 @@ void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) { - // Do nothing in case of SPMD mode and L0 parallel. - if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) - return; - - CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc); + // Nothing to do. } void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, @@ -1510,13 +1506,16 @@ SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef CapturedVars, - const Expr *IfCond) { + const Expr *IfCond, + llvm::Value *NumThreads) { if (!CGF.HaveInsertPoint()) return; - auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, - IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) { + auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond, + NumThreads](CodeGenFunction &CGF, + PrePostActionTy &Action) { CGBuilderTy &Bld = CGF.Builder; + llvm::Value *NumThreadsVal = NumThreads; llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); if (WFn) @@ -1556,13 +1555,18 @@ else IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1); - assert(IfCondVal && "Expected a value"); + if (!NumThreadsVal) + NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1); + else + NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty), + + assert(IfCondVal && "Expected a value"); llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); llvm::Value *Args[] = { RTLoc, getThreadID(CGF, Loc), IfCondVal, - llvm::ConstantInt::get(CGF.Int32Ty, -1), + NumThreadsVal, llvm::ConstantInt::get(CGF.Int32Ty, -1), FnPtr, ID, diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1557,14 +1557,14 @@ OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen, const CodeGenBoundParametersTy &CodeGenBoundParameters) { const CapturedStmt *CS = S.getCapturedStmt(OMPD_parallel); + llvm::Value *NumThreads = nullptr; llvm::Function *OutlinedFn = CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction( S, *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen); if (const auto *NumThreadsClause = S.getSingleClause()) { CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF); - llvm::Value *NumThreads = - CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(), - /*IgnoreResultAssign=*/true); + NumThreads = CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(), + /*IgnoreResultAssign=*/true); CGF.CGM.getOpenMPRuntime().emitNumThreadsClause( CGF, NumThreads, NumThreadsClause->getBeginLoc()); } @@ -1591,7 +1591,7 @@ CodeGenBoundParameters(CGF, S, CapturedVars); CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars); CGF.CGM.getOpenMPRuntime().emitParallelCall(CGF, S.getBeginLoc(), OutlinedFn, - CapturedVars, IfCond); + CapturedVars, IfCond, NumThreads); } static bool isAllocatableDecl(const VarDecl *VD) { diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp @@ -46,12 +46,11 @@ // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR6:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 2) // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8* // CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i64 1) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i64 1) // CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: @@ -65,12 +64,11 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 2) // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8* // CHECK1-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i64 1) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i64 1) // CHECK1-NEXT: ret void // // @@ -179,12 +177,11 @@ // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR6:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 2) // CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8* // CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1) // CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: @@ -198,12 +195,11 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 -// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 2) // CHECK2-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 // CHECK2-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8* // CHECK2-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1) // CHECK2-NEXT: ret void // // @@ -312,12 +308,11 @@ // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR6:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 2) // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8* // CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1) // CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: @@ -331,12 +326,11 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 -// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 2) // CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8* // CHECK3-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1) // CHECK3-NEXT: ret void // // diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -165,7 +165,7 @@ // CHECK1-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP0]] to i8* // CHECK1-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i64 2) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i64 2) // CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: @@ -301,7 +301,7 @@ // CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[CONV19]], 1 // CHECK1-NEXT: [[CONV21:%.*]] = trunc i32 [[ADD20]] to i8 // CHECK1-NEXT: store i8 [[CONV21]], i8* [[Y]], align 8 -// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR6:[0-9]+]] +// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR8:[0-9]+]] // CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8 // CHECK1-NEXT: [[ADD22:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK1-NEXT: store i64 [[ADD22]], i64* [[CALL]], align 8 @@ -312,7 +312,7 @@ // // // CHECK1-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi -// CHECK1-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR3:[0-9]+]] comdat align 2 { +// CHECK1-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 8 // CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -405,7 +405,7 @@ // CHECK1-NEXT: [[TMP8:%.*]] = load double, double* [[A7]], align 8 // CHECK1-NEXT: [[CONV8:%.*]] = fptosi double [[TMP8]] to i32 // CHECK1-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 -// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV8]], double* nonnull align 8 dereferenceable(8) [[A9]]) #[[ATTR6]] +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV8]], double* nonnull align 8 dereferenceable(8) [[A9]]) #[[ATTR8]] // CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: @@ -413,7 +413,7 @@ // // // CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 @@ -443,7 +443,7 @@ // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: -// CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR7:[0-9]+]] +// CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR9:[0-9]+]] // CHECK1-NEXT: unreachable // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -510,7 +510,7 @@ // // // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 @@ -552,7 +552,7 @@ // CHECK2-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP0]] to i8* // CHECK2-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2) // CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: @@ -687,7 +687,7 @@ // CHECK2-NEXT: [[ADD19:%.*]] = add nsw i32 [[CONV18]], 1 // CHECK2-NEXT: [[CONV20:%.*]] = trunc i32 [[ADD19]] to i8 // CHECK2-NEXT: store i8 [[CONV20]], i8* [[Y]], align 8 -// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR6:[0-9]+]] +// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR8:[0-9]+]] // CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8 // CHECK2-NEXT: [[ADD21:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK2-NEXT: store i64 [[ADD21]], i64* [[CALL]], align 8 @@ -698,7 +698,7 @@ // // // CHECK2-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi -// CHECK2-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR3:[0-9]+]] comdat align 2 { +// CHECK2-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4 // CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -789,7 +789,7 @@ // CHECK2-NEXT: [[TMP8:%.*]] = load double, double* [[A6]], align 8 // CHECK2-NEXT: [[CONV7:%.*]] = fptosi double [[TMP8]] to i32 // CHECK2-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 -// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR6]] +// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR8]] // CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: @@ -797,7 +797,7 @@ // // // CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR3]] { +// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR4]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 @@ -827,7 +827,7 @@ // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: -// CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR7:[0-9]+]] +// CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR9:[0-9]+]] // CHECK2-NEXT: unreachable // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -893,7 +893,7 @@ // // // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 @@ -935,7 +935,7 @@ // CHECK3-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP0]] to i8* // CHECK3-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2) // CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: @@ -1070,7 +1070,7 @@ // CHECK3-NEXT: [[ADD19:%.*]] = add nsw i32 [[CONV18]], 1 // CHECK3-NEXT: [[CONV20:%.*]] = trunc i32 [[ADD19]] to i8 // CHECK3-NEXT: store i8 [[CONV20]], i8* [[Y]], align 8 -// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR6:[0-9]+]] +// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR8:[0-9]+]] // CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8 // CHECK3-NEXT: [[ADD21:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK3-NEXT: store i64 [[ADD21]], i64* [[CALL]], align 8 @@ -1081,7 +1081,7 @@ // // // CHECK3-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi -// CHECK3-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR3:[0-9]+]] comdat align 2 { +// CHECK3-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4 // CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -1172,7 +1172,7 @@ // CHECK3-NEXT: [[TMP8:%.*]] = load double, double* [[A6]], align 8 // CHECK3-NEXT: [[CONV7:%.*]] = fptosi double [[TMP8]] to i32 // CHECK3-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 -// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR6]] +// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR8]] // CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: @@ -1180,7 +1180,7 @@ // // // CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR3]] { +// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 @@ -1210,7 +1210,7 @@ // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: -// CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR7:[0-9]+]] +// CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR9:[0-9]+]] // CHECK3-NEXT: unreachable // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -1276,7 +1276,7 @@ // // // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -62,7 +62,7 @@ // CHECK1-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8* // CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) // CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: @@ -119,7 +119,7 @@ // CHECK1-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* // CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3) // CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: @@ -173,7 +173,7 @@ // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8* // CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) // CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: @@ -229,7 +229,7 @@ // CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* // CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK2-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) // CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: @@ -283,7 +283,7 @@ // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8* // CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) // CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: @@ -339,7 +339,7 @@ // CHECK3-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* // CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK3-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) // CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: @@ -393,7 +393,7 @@ // CHECK4-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8* // CHECK4-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK4-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) // CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: @@ -450,7 +450,7 @@ // CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* // CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 // CHECK4-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3) +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3) // CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: @@ -504,7 +504,7 @@ // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8* // CHECK5-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) // CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK5-NEXT: ret void // CHECK5: worker.exit: @@ -560,7 +560,7 @@ // CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* // CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) // CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK5-NEXT: ret void // CHECK5: worker.exit: @@ -614,7 +614,7 @@ // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8* // CHECK6-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) // CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK6-NEXT: ret void // CHECK6: worker.exit: @@ -670,7 +670,7 @@ // CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* // CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK6-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) // CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK6-NEXT: ret void // CHECK6: worker.exit: diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -295,8 +295,6 @@ /// TODO uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); -/// TODO -void __kmpc_push_num_threads(IdentTy *Loc, int32_t, int32_t NumThreads); ///} /// Tasking diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -212,11 +212,6 @@ return omp_get_thread_num(); } -void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) { - FunctionTracingRAII(); - icv::NThreads = NumThreads; -} - void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, int32_t thread_limit) { FunctionTracingRAII(); diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -181,10 +181,6 @@ topTaskDescr[tid] = taskICV; } INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; - // parallel - INLINE uint16_t &NumThreadsForNextParallel(int tid) { - return nextRegion.tnum[tid]; - } // schedule (for dispatch) INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } INLINE int64_t &Chunk(int tid) { return chunk[tid]; } @@ -204,11 +200,6 @@ omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; // pointer where to find the current task ICV (top of the stack) omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; - union { - // Only one of the two is live at the same time. - // parallel - uint16_t tnum[MAX_THREADS_PER_TEAM]; - } nextRegion; // schedule (for dispatch) kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for int64_t chunk[MAX_THREADS_PER_TEAM]; diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -158,8 +158,6 @@ // levelOneTaskDescr is init when starting the parallel region // top task descr is NULL (team master version will be fixed separately) topTaskDescr[tid] = NULL; - // no num threads value has been pushed - nextRegion.tnum[tid] = 0; // the following don't need to be init here; they are init when using dyn // sched // current_Event, events_Number, chunk, num_Iterations, schedule diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -73,7 +73,8 @@ } // This routine is always called by the team master.. -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) { +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + kmp_int32 NumThreadsClause) { PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); omptarget_nvptx_workFn = WorkFn; @@ -92,9 +93,6 @@ return; } - uint16_t &NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - uint16_t NumThreads = determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); @@ -255,16 +253,6 @@ // push params //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, - int32_t num_threads) { - PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), - "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = - num_threads; -} - // Do nothing. The host guarantees we started the requested number of // teams and we only need inspection of gridDim. @@ -304,11 +292,7 @@ return; } - // Handle the num_threads clause. - if (num_threads != -1) - __kmpc_push_num_threads(ident, global_tid, num_threads); - - __kmpc_kernel_prepare_parallel((void *)wrapper_fn); + __kmpc_kernel_prepare_parallel((void *)wrapper_fn, num_threads); if (nargs) { void **GlobalArgs; diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -220,8 +220,6 @@ // parallel EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, - int32_t num_threads); NOINLINE EXTERN uint8_t __kmpc_parallel_level(); // proc bind @@ -445,7 +443,8 @@ bool RequiresFullRuntime); EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode, bool RequiresFullRuntime); -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + int32_t NumThreadsClause); EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel();