diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -882,17 +882,45 @@ TryImperfectlyNestedLoops); } + /// Calls the specified callback function for all the loops in \p CurStmt, + /// from the outermost to the innermost. + static bool doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops, + unsigned NumLoops, + llvm::function_ref Callback, + llvm::function_ref + OnTransformationCallback); + static bool + doForAllLoops(const Stmt *CurStmt, bool TryImperfectlyNestedLoops, + unsigned NumLoops, + llvm::function_ref Callback, + llvm::function_ref + OnTransformationCallback) { + auto &&NewCallback = [Callback](unsigned Cnt, Stmt *CurStmt) { + return Callback(Cnt, CurStmt); + }; + auto &&NewTransformCb = + [OnTransformationCallback](OMPLoopBasedDirective *A) { + OnTransformationCallback(A); + }; + return doForAllLoops(const_cast(CurStmt), TryImperfectlyNestedLoops, + NumLoops, NewCallback, NewTransformCb); + } + /// Calls the specified callback function for all the loops in \p CurStmt, /// from the outermost to the innermost. static bool doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, - llvm::function_ref Callback); + llvm::function_ref Callback) { + auto &&TransformCb = [](OMPLoopBasedDirective *) {}; + return doForAllLoops(CurStmt, TryImperfectlyNestedLoops, NumLoops, Callback, + TransformCb); + } static bool doForAllLoops(const Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, llvm::function_ref Callback) { - auto &&NewCallback = [Callback](unsigned Cnt, Stmt *CurStmt) { + auto &&NewCallback = [Callback](unsigned Cnt, const Stmt *CurStmt) { return Callback(Cnt, CurStmt); }; return doForAllLoops(const_cast(CurStmt), TryImperfectlyNestedLoops, diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -124,11 +124,15 @@ bool OMPLoopBasedDirective::doForAllLoops( Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, - llvm::function_ref Callback) { + llvm::function_ref Callback, + llvm::function_ref + OnTransformationCallback) { CurStmt = CurStmt->IgnoreContainers(); for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) { - if (auto *Dir = dyn_cast(CurStmt)) + while (auto *Dir = dyn_cast(CurStmt)) { + OnTransformationCallback(Dir); CurStmt = Dir->getTransformedStmt(); + } if (auto *CanonLoop = dyn_cast(CurStmt)) CurStmt = CanonLoop->getLoopStmt(); if (Callback(Cnt, CurStmt)) diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1819,8 +1819,6 @@ return; } if (SimplifiedS == NextLoop) { - OMPTransformDirectiveScopeRAII PossiblyTransformDirectiveScope(CGF, - SimplifiedS); if (auto *Dir = dyn_cast(SimplifiedS)) SimplifiedS = Dir->getTransformedStmt(); if (const auto *CanonLoop = dyn_cast(SimplifiedS)) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -8964,6 +8964,18 @@ } } return false; + }, + [&SemaRef, &Captures](OMPLoopBasedDirective *Transform) { + Stmt *DependentPreInits = + cast(Transform)->getPreInits(); + if (!DependentPreInits) + return; + for (Decl *C : cast(DependentPreInits)->getDeclGroup()) { + auto *D = cast(C); + DeclRefExpr *Ref = buildDeclRefExpr(SemaRef, D, D->getType(), + Transform->getBeginLoc()); + Captures[Ref] = Ref; + } })) return 0; @@ -12564,7 +12576,8 @@ // Verify and diagnose loop nest. SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; - SmallVector OriginalInits; + SmallVector, 0>, 4> + OriginalInits(1); if (!OMPLoopBasedDirective::doForAllLoops( AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops, @@ -12578,16 +12591,23 @@ return true; assert(SingleNumLoops == 1 && "Expect single loop iteration space"); if (auto *For = dyn_cast(CurStmt)) { - OriginalInits.push_back(For->getInit()); + OriginalInits.back().push_back(For->getInit()); Body = For->getBody(); } else { assert(isa(CurStmt) && "Expected canonical for or range-based for loops."); auto *CXXFor = cast(CurStmt); - OriginalInits.push_back(CXXFor->getBeginStmt()); + OriginalInits.back().push_back(CXXFor->getBeginStmt()); Body = CXXFor->getBody(); } + OriginalInits.emplace_back(); return false; + }, + [&OriginalInits](OMPLoopBasedDirective *Transform) { + Stmt *DependentPreInits = + cast(Transform)->getPreInits(); + for (Decl *C : cast(DependentPreInits)->getDeclGroup()) + OriginalInits.back().push_back(C); })) return StmtError(); @@ -12596,7 +12616,6 @@ return OMPTileDirective::Create(Context, StartLoc, EndLoc, Clauses, NumLoops, AStmt, nullptr, nullptr); - // Collection of generated variable declaration. SmallVector PreInits; // Create iteration variables for the generated loops. @@ -12606,8 +12625,7 @@ TileIndVars.resize(NumLoops); for (unsigned I = 0; I < NumLoops; ++I) { OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I]; - if (auto *PI = cast_or_null(LoopHelper.PreInits)) - PreInits.append(PI->decl_begin(), PI->decl_end()); + assert(LoopHelper.Counters.size() == 1 && "Expect single-dimensional loop iteration space"); auto *OrigCntVar = cast(LoopHelper.Counters.front()); @@ -12636,7 +12654,13 @@ TileCntDecl->setDeclName(&PP.getIdentifierTable().get(TileCntName)); TileIndVars[I] = TileCntDecl; } - if (auto *PI = dyn_cast_or_null(OriginalInits[I])) + for (auto &P : OriginalInits[I]) { + if (auto *D = P.dyn_cast()) + PreInits.push_back(D); + else if (auto *PI = dyn_cast_or_null(P.dyn_cast())) + PreInits.append(PI->decl_begin(), PI->decl_end()); + } + if (auto *PI = cast_or_null(LoopHelper.PreInits)) PreInits.append(PI->decl_begin(), PI->decl_end()); // Gather declarations for the data members used as counters. for (Expr *CounterRef : LoopHelper.Counters) { diff --git a/clang/test/OpenMP/tile_codegen.cpp b/clang/test/OpenMP/tile_codegen.cpp --- a/clang/test/OpenMP/tile_codegen.cpp +++ b/clang/test/OpenMP/tile_codegen.cpp @@ -378,17 +378,19 @@ // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTFLOOR_1_IV_J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTILE_1_IV_J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 7, i32* [[J]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 @@ -418,8 +420,6 @@ // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP6]], 5 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTFLOOR_0_IV_I]], align 4 -// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 -// CHECK1-NEXT: store i32 7, i32* [[J]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTFLOOR_1_IV_J]], align 4 // CHECK1-NEXT: br label [[FOR_COND:%.*]] // CHECK1: for.cond: @@ -521,18 +521,20 @@ // CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTFLOOR_1_IV_J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTILE_1_IV_J:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 7, i32* [[J]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 3, i32* [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 @@ -571,8 +573,6 @@ // CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 5 // CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]] // CHECK1-NEXT: store i32 [[ADD6]], i32* [[DOTFLOOR_0_IV_I]], align 4 -// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 -// CHECK1-NEXT: store i32 7, i32* [[J]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTFLOOR_1_IV_J]], align 4 // CHECK1-NEXT: br label [[FOR_COND:%.*]] // CHECK1: for.cond: @@ -675,6 +675,7 @@ // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8 @@ -688,8 +689,8 @@ // CHECK1-NEXT: [[DOTFLOOR_0_IV_I11:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTILE_0_IV_I12:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[J13:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP]], align 4 @@ -851,7 +852,6 @@ // CHECK1-NEXT: [[ADD91:%.*]] = add nsw i64 7, [[MUL90]] // CHECK1-NEXT: [[CONV92:%.*]] = trunc i64 [[ADD91]] to i32 // CHECK1-NEXT: store i32 [[CONV92]], i32* [[J13]], align 4 -// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 // CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTTILE_0_IV_I12]], align 4 // CHECK1-NEXT: [[MUL93:%.*]] = mul nsw i32 [[TMP39]], 3 // CHECK1-NEXT: [[ADD94:%.*]] = add nsw i32 7, [[MUL93]] @@ -891,15 +891,16 @@ // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 @@ -931,7 +932,6 @@ // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 5 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTFLOOR_0_IV_I]], align 4 -// CHECK1-NEXT: store i32 7, i32* [[I]], align 4 // CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTFLOOR_0_IV_I]], align 4 // CHECK1-NEXT: store i32 [[TMP8]], i32* [[DOTTILE_0_IV_I]], align 4 // CHECK1-NEXT: br label [[FOR_COND:%.*]] @@ -992,28 +992,28 @@ // CHECK1-NEXT: entry: // CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store i32 [[START]], i32* [[START_ADDR]], align 4 // CHECK1-NEXT: store i32 [[END]], i32* [[END_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = load i32, i32* [[START_ADDR]], align 4 -// CHECK1-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[END_ADDR]], align 4 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: store i32 [[TMP0]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]] // CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 3 // CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 3 // CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 // CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[START_ADDR]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[I]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTFLOOR_0_IV_I]], align 4 // CHECK1-NEXT: br label [[FOR_COND:%.*]] // CHECK1: for.cond: @@ -1372,17 +1372,19 @@ // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTFLOOR_1_IV_J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTILE_1_IV_J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 7, i32* [[J]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 @@ -1412,8 +1414,6 @@ // CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP6]], 5 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTFLOOR_0_IV_I]], align 4 -// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 7, i32* [[J]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTFLOOR_1_IV_J]], align 4 // CHECK2-NEXT: br label [[FOR_COND:%.*]] // CHECK2: for.cond: @@ -1515,18 +1515,20 @@ // CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTFLOOR_1_IV_J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTILE_1_IV_J:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 7, i32* [[J]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK2-NEXT: store i32 3, i32* [[DOTOMP_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 @@ -1565,8 +1567,6 @@ // CHECK2-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 5 // CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]] // CHECK2-NEXT: store i32 [[ADD6]], i32* [[DOTFLOOR_0_IV_I]], align 4 -// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 7, i32* [[J]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTFLOOR_1_IV_J]], align 4 // CHECK2-NEXT: br label [[FOR_COND:%.*]] // CHECK2: for.cond: @@ -1669,6 +1669,7 @@ // CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8 @@ -1682,8 +1683,8 @@ // CHECK2-NEXT: [[DOTFLOOR_0_IV_I11:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTILE_0_IV_I12:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[J13:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 // CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP]], align 4 @@ -1845,7 +1846,6 @@ // CHECK2-NEXT: [[ADD91:%.*]] = add nsw i64 7, [[MUL90]] // CHECK2-NEXT: [[CONV92:%.*]] = trunc i64 [[ADD91]] to i32 // CHECK2-NEXT: store i32 [[CONV92]], i32* [[J13]], align 4 -// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 // CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTTILE_0_IV_I12]], align 4 // CHECK2-NEXT: [[MUL93:%.*]] = mul nsw i32 [[TMP39]], 3 // CHECK2-NEXT: [[ADD94:%.*]] = add nsw i32 7, [[MUL93]] @@ -1885,15 +1885,16 @@ // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 @@ -1925,7 +1926,6 @@ // CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 5 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTFLOOR_0_IV_I]], align 4 -// CHECK2-NEXT: store i32 7, i32* [[I]], align 4 // CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTFLOOR_0_IV_I]], align 4 // CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTILE_0_IV_I]], align 4 // CHECK2-NEXT: br label [[FOR_COND:%.*]] @@ -1986,28 +1986,28 @@ // CHECK2-NEXT: entry: // CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 [[START]], i32* [[START_ADDR]], align 4 // CHECK2-NEXT: store i32 [[END]], i32* [[END_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i32, i32* [[START_ADDR]], align 4 -// CHECK2-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[END_ADDR]], align 4 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]] // CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 // CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 3 // CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 3 // CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 // CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[START_ADDR]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[I]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTFLOOR_0_IV_I]], align 4 // CHECK2-NEXT: br label [[FOR_COND:%.*]] // CHECK2: for.cond: diff --git a/clang/test/OpenMP/tile_codegen_for_dependent.cpp b/clang/test/OpenMP/tile_codegen_for_dependent.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/tile_codegen_for_dependent.cpp @@ -0,0 +1,193 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +// The loop trip count used by #pragma omp for depends on code generated +// by #pragma omp file. Check that theses PreInits are emitted before +// the code generated by #pragma omp for. + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV_I12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] +// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] +// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 +// IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4 +// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 +// IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP11]] +// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i32 0, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 %[[TMP12]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP13]], %[[TMP14]] +// IR-NEXT: br i1 %[[CMP13]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP15]], %[[COND_TRUE]] ], [ %[[TMP16]], %[[COND_FALSE]] ] +// IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP17]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP19]], 1 +// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP18]], %[[ADD14]] +// IR-NEXT: br i1 %[[CMP15]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP20]], 4 +// IR-NEXT: %[[ADD16:.+]] = add i32 0, %[[MUL]] +// IR-NEXT: store i32 %[[ADD16]], i32* %[[DOTFLOOR_0_IV_I12]], align 4 +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I12]], align 4 +// IR-NEXT: store i32 %[[TMP21]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP23]], 1 +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I12]], align 4 +// IR-NEXT: %[[ADD18:.+]] = add nsw i32 %[[TMP24]], 4 +// IR-NEXT: %[[CMP19:.+]] = icmp ult i32 %[[ADD17]], %[[ADD18]] +// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE20:.+]], label %[[COND_FALSE22:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE20]]: +// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD21:.+]] = add i32 %[[TMP25]], 1 +// IR-NEXT: br label %[[COND_END24:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE22]]: +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I12]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add nsw i32 %[[TMP26]], 4 +// IR-NEXT: br label %[[COND_END24]] +// IR-EMPTY: +// IR-NEXT: [[COND_END24]]: +// IR-NEXT: %[[COND25:.+]] = phi i32 [ %[[ADD21]], %[[COND_TRUE20]] ], [ %[[ADD23]], %[[COND_FALSE22]] ] +// IR-NEXT: %[[CMP26:.+]] = icmp ult i32 %[[TMP22]], %[[COND25]] +// IR-NEXT: br i1 %[[CMP26]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL27:.+]] = mul i32 %[[TMP28]], %[[TMP29]] +// IR-NEXT: %[[ADD28:.+]] = add i32 %[[TMP27]], %[[MUL27]] +// IR-NEXT: store i32 %[[ADD28]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) @body(i32 %[[TMP30]], i32 %[[TMP31]], i32 %[[TMP32]], i32 %[[TMP33]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP34]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP35:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP35]], 1 +// IR-NEXT: store i32 %[[ADD29]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_LOOP_EXIT]]: +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP0]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: call void @__kmpc_barrier(%struct.ident_t* @3, i32 %[[TMP0]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { +#pragma omp for +#pragma omp tile sizes(4) + for (int i = start; i < end; i += step) + body(start, end, step, i); +} + +#endif /* HEADER */ diff --git a/clang/test/OpenMP/tile_codegen_tile_for.cpp b/clang/test/OpenMP/tile_codegen_tile_for.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/tile_codegen_tile_for.cpp @@ -0,0 +1,253 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +// Account for multiple transformations of a loop before consumed by +// #pragma omp for. + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTTILE_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] +// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] +// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 +// IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4 +// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 +// IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1 +// IR-NEXT: store i32 %[[ADD13]], i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -2 +// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 3 +// IR-NEXT: %[[SUB17:.+]] = sub i32 %[[DIV16]], 1 +// IR-NEXT: store i32 %[[SUB17]], i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]] +// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i32 0, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: store i32 %[[TMP14]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: %[[CMP19:.+]] = icmp ugt i32 %[[TMP15]], %[[TMP16]] +// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP17]], %[[COND_TRUE]] ], [ %[[TMP18]], %[[COND_FALSE]] ] +// IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP19]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[ADD20:.+]] = add i32 %[[TMP21]], 1 +// IR-NEXT: %[[CMP21:.+]] = icmp ult i32 %[[TMP20]], %[[ADD20]] +// IR-NEXT: br i1 %[[CMP21]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 3 +// IR-NEXT: %[[ADD22:.+]] = add i32 0, %[[MUL]] +// IR-NEXT: store i32 %[[ADD22]], i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 +// IR-NEXT: store i32 %[[TMP23]], i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 1 +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 +// IR-NEXT: %[[ADD24:.+]] = add i32 %[[TMP26]], 3 +// IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[ADD23]], %[[ADD24]] +// IR-NEXT: br i1 %[[CMP25]], label %[[COND_TRUE26:.+]], label %[[COND_FALSE28:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE26]]: +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD27:.+]] = add i32 %[[TMP27]], 1 +// IR-NEXT: br label %[[COND_END30:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE28]]: +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 +// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP28]], 3 +// IR-NEXT: br label %[[COND_END30]] +// IR-EMPTY: +// IR-NEXT: [[COND_END30]]: +// IR-NEXT: %[[COND31:.+]] = phi i32 [ %[[ADD27]], %[[COND_TRUE26]] ], [ %[[ADD29]], %[[COND_FALSE28]] ] +// IR-NEXT: %[[CMP32:.+]] = icmp ult i32 %[[TMP24]], %[[COND31]] +// IR-NEXT: br i1 %[[CMP32]], label %[[FOR_BODY:.+]], label %[[FOR_END51:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[MUL33:.+]] = mul i32 %[[TMP29]], 4 +// IR-NEXT: %[[ADD34:.+]] = add i32 0, %[[MUL33]] +// IR-NEXT: store i32 %[[ADD34]], i32* %[[DOTFLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 +// IR-NEXT: store i32 %[[TMP30]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND35:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND35]]: +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP32]], 1 +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[ADD37:.+]] = add nsw i32 %[[TMP33]], 4 +// IR-NEXT: %[[CMP38:.+]] = icmp ult i32 %[[ADD36]], %[[ADD37]] +// IR-NEXT: br i1 %[[CMP38]], label %[[COND_TRUE39:.+]], label %[[COND_FALSE41:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE39]]: +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD40:.+]] = add i32 %[[TMP34]], 1 +// IR-NEXT: br label %[[COND_END43:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE41]]: +// IR-NEXT: %[[TMP35:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[ADD42:.+]] = add nsw i32 %[[TMP35]], 4 +// IR-NEXT: br label %[[COND_END43]] +// IR-EMPTY: +// IR-NEXT: [[COND_END43]]: +// IR-NEXT: %[[COND44:.+]] = phi i32 [ %[[ADD40]], %[[COND_TRUE39]] ], [ %[[ADD42]], %[[COND_FALSE41]] ] +// IR-NEXT: %[[CMP45:.+]] = icmp ult i32 %[[TMP31]], %[[COND44]] +// IR-NEXT: br i1 %[[CMP45]], label %[[FOR_BODY46:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY46]]: +// IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL47:.+]] = mul i32 %[[TMP37]], %[[TMP38]] +// IR-NEXT: %[[ADD48:.+]] = add i32 %[[TMP36]], %[[MUL47]] +// IR-NEXT: store i32 %[[ADD48]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP40:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP42:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) @body(i32 %[[TMP39]], i32 %[[TMP40]], i32 %[[TMP41]], i32 %[[TMP42]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP43:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP43]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND35]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[FOR_INC49:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC49]]: +// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[INC50:.+]] = add i32 %[[TMP44]], 1 +// IR-NEXT: store i32 %[[INC50]], i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP4:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END51]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP45:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD52:.+]] = add i32 %[[TMP45]], 1 +// IR-NEXT: store i32 %[[ADD52]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_LOOP_EXIT]]: +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP0]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: call void @__kmpc_barrier(%struct.ident_t* @3, i32 %[[TMP0]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { +#pragma omp for +#pragma omp tile sizes(3) +#pragma omp tile sizes(4) + for (int i = start; i < end; i += step) + body(start, end, step, i); +} + +#endif /* HEADER */ +// IR: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// IR: ![[META1:[0-9]+]] = !{!"clang version {{[^"]*}}"} +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOP4]] = distinct !{![[LOOP4]], ![[LOOPPROP3]]}