diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -407,6 +407,25 @@ bool NeedsBarrier, Value *Chunk = nullptr); + /// Insert doacross loop info into a workshare loop. + /// + /// In \p AllocaIP, allocate space for the loop bounds info. At the front of + /// \p PreHeaderBB, store \p DoacrossVars in the loop bounds info and call the + /// doacross init runtime function. Call the doacross fini runtime function + /// in \p ExitBB. + /// + /// \param DL Debug location for instructions. + /// \param AllocaIP An insertion point for Alloca instructions. + /// \param PreHeaderBB The preheader basic block of the loop. + /// \param ExitBB The exit basic block of the loop. + /// \param OrderedVal The ordered parameter (n) specified in the ordered clause. + /// \param DoacrossVars The lower bounds, upper bounds, and steps of the n + /// outer loops. + void applyDoacrossLoop(DebugLoc DL, InsertPointTy AllocaIP, + BasicBlock *PreHeaderBB, BasicBlock *ExitBB, + std::int64_t OrderedVal, + ArrayRef<Value *> DoacrossVars); + /// Modifies the canonical loop to be a workshare loop. /// /// This takes a \p LoopInfo representing a canonical loop, such as the one diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1549,16 +1549,16 @@ CLI->getExit()->getTerminator()->getIterator()); Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum}); + Builder.restoreIP(CLI->getAfterIP()); // Add the barrier if requested. if (NeedsBarrier) createBarrier(LocationDescription(Builder.saveIP(), DL), omp::Directive::OMPD_for, /* ForceSimpleCall */ false, /* CheckCancelFlag */ false); - InsertPointTy AfterIP = CLI->getAfterIP(); CLI->invalidate(); - return AfterIP; + return Builder.saveIP(); } OpenMPIRBuilder::InsertPointTy @@ -1699,16 +1699,79 @@ assert(BI->getSuccessor(1) == Exit); BI->setSuccessor(1, OuterCond); + Builder.restoreIP(AfterIP); // Add the barrier if requested. - if (NeedsBarrier) { - Builder.SetInsertPoint(&Exit->back()); + if (NeedsBarrier) createBarrier(LocationDescription(Builder.saveIP(), DL), omp::Directive::OMPD_for, /* ForceSimpleCall */ false, /* CheckCancelFlag */ false); - } CLI->invalidate(); - return AfterIP; + return Builder.saveIP(); +} + +void OpenMPIRBuilder::applyDoacrossLoop(DebugLoc DL, InsertPointTy AllocaIP, + BasicBlock *PreHeaderBB, + BasicBlock *ExitBB, + std::int64_t OrderedVal, + ArrayRef<Value *> DoacrossVars) { + assert(DoacrossVars[0]->getType()->isIntegerTy(64) && + "Doacross init runtime call requires loop bounds info with i64 type"); + // Set up the source location value for OpenMP runtime. + Builder.SetInsertPoint(&PreHeaderBB->front()); + Builder.SetCurrentDebugLocation(DL); + + Constant *SrcLocStr = getOrCreateSrcLocStr(DL); + Value *SrcLoc = getOrCreateIdent(SrcLocStr); + + // Allocate space for loop bounds and generate the alloca instruction. 
+ SmallVector<Type *> ElementsTys; + ElementsTys.emplace_back(Int64); // lower + ElementsTys.emplace_back(Int64); // upper + ElementsTys.emplace_back(Int64); // stride(step) + auto *KmpDimTy = StructType::create(ElementsTys, "kmp_dim"); + auto *DimsTy = ArrayType::get(KmpDimTy, OrderedVal); + + Builder.restoreIP(AllocaIP); + AllocaInst *DimsInst = Builder.CreateAlloca(DimsTy, nullptr, "dims"); + DimsInst->setAlignment(Align(8)); + + // Emit doacross init call in preheader front. + Builder.SetInsertPoint(&PreHeaderBB->front()); + + // Store doacross loop vars in loop bounds. + for (std::int64_t I = 0; I < OrderedVal; I++) { + Value *LoopBounds = Builder.CreateInBoundsGEP( + DimsTy, DimsInst, {Builder.getInt64(0), Builder.getInt64(I)}); + Value *LowerBound = Builder.CreateInBoundsGEP( + KmpDimTy, LoopBounds, {Builder.getInt32(0), Builder.getInt32(0)}); + StoreInst *LBInst = Builder.CreateStore(DoacrossVars[I * 3], LowerBound); + LBInst->setAlignment(Align(8)); + Value *UpperBound = Builder.CreateInBoundsGEP( + KmpDimTy, LoopBounds, {Builder.getInt32(0), Builder.getInt32(1)}); + StoreInst *UBInst = + Builder.CreateStore(DoacrossVars[I * 3 + 1], UpperBound); + UBInst->setAlignment(Align(8)); + Value *Step = Builder.CreateInBoundsGEP( + KmpDimTy, LoopBounds, {Builder.getInt32(0), Builder.getInt32(2)}); + StoreInst *StepInst = Builder.CreateStore(DoacrossVars[I * 3 + 2], Step); + StepInst->setAlignment(Align(8)); + } + + Value *LoopBoundsBase = Builder.CreateInBoundsGEP( + DimsTy, DimsInst, {Builder.getInt64(0), Builder.getInt64(0)}); + Value *LoopBoundsBaseInt8Ptr = Builder.CreateBitCast(LoopBoundsBase, Int8Ptr); + + Value *ThreadId = getOrCreateThreadID(SrcLoc); + Function *RTLFnInit = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_init); + Builder.CreateCall(RTLFnInit, {SrcLoc, ThreadId, Builder.getInt32(OrderedVal), + LoopBoundsBaseInt8Ptr}); + + Builder.SetInsertPoint(&ExitBB->back()); + Function *RTLFnFini = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_fini); + Builder.CreateCall(RTLFnFini, {SrcLoc, ThreadId}); } /// Make \p Source branch to \p Target. diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -1764,11 +1764,13 @@ BasicBlock *Body = CLI->getBody(); Value *IV = CLI->getIndVar(); BasicBlock *ExitBlock = CLI->getExit(); + BasicBlock *AfterBlock = CLI->getAfter(); Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); InsertPointTy AllocaIP = Builder.saveIP(); - OMPBuilder.applyStaticWorkshareLoop(DL, CLI, AllocaIP, /*NeedsBarrier=*/true); + InsertPointTy EndIP = OMPBuilder.applyStaticWorkshareLoop( + DL, CLI, AllocaIP, /*NeedsBarrier=*/true); BasicBlock *Cond = Body->getSinglePredecessor(); Instruction *Cmp = &*Cond->begin(); @@ -1834,11 +1836,22 @@ // increment and in the statement that adds the lower bound to it. EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3); - // The exit block should contain the "fini" call and the barrier call, - // plus the call to obtain the thread ID. + // The exit block should contain the "fini" call. size_t NumCallsInExitBlock = count_if(*ExitBlock, [](Instruction &I) { return isa<CallInst>(I); }); - EXPECT_EQ(NumCallsInExitBlock, 3u); + EXPECT_EQ(NumCallsInExitBlock, 1u); + + // The after block should contain the barrier call, plus the call to obtain + // the thread ID. 
+ size_t NumCallsInAfterBlock = + count_if(*AfterBlock, [](Instruction &I) { return isa<CallInst>(I); }); + EXPECT_EQ(NumCallsInAfterBlock, 2u); + + // Add a termination to our block and check that it is internally consistent. + Builder.restoreIP(EndIP); + Builder.CreateRetVoid(); + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); } TEST_P(OpenMPIRBuilderTestWithParams, DynamicWorkShareLoop) { @@ -1882,7 +1895,7 @@ // createDynamicWorkshareLoop. InsertPointTy AfterIP = CLI->getAfterIP(); BasicBlock *Preheader = CLI->getPreheader(); - BasicBlock *ExitBlock = CLI->getExit(); + BasicBlock *AfterBlock = CLI->getAfter(); Value *IV = CLI->getIndVar(); InsertPointTy EndIP = @@ -1944,11 +1957,11 @@ // increment and in the statement that adds the lower bound to it. EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3); - // The exit block should contain the barrier call, plus the call to obtain + // The after block should contain the barrier call, plus the call to obtain // the thread ID. - size_t NumCallsInExitBlock = - count_if(*ExitBlock, [](Instruction &I) { return isa<CallInst>(I); }); - EXPECT_EQ(NumCallsInExitBlock, 2u); + size_t NumCallsInAfterBlock = + count_if(*AfterBlock, [](Instruction &I) { return isa<CallInst>(I); }); + EXPECT_EQ(NumCallsInAfterBlock, 2u); // Add a termination to our block and check that it is internally consistent. Builder.restoreIP(EndIP); @@ -1975,6 +1988,139 @@ omp::OMPScheduleType::Runtime | omp::OMPScheduleType::ModifierMonotonic)); +TEST_F(OpenMPIRBuilderTest, DoacrossLoop) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + IRBuilder<> Builder(BB); + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + + Type *LCTy = Type::getInt32Ty(Ctx); + Value *StartVal = ConstantInt::get(LCTy, 10); + Value *StopVal = ConstantInt::get(LCTy, 52); + Value *StepVal = ConstantInt::get(LCTy, 2); + auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {}; + + CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop( + Loc, LoopBodyGen, StartVal, StopVal, StepVal, + /*IsSigned=*/false, /*InclusiveStop=*/false); + BasicBlock *Preheader = CLI->getPreheader(); + BasicBlock *ExitBlock = CLI->getExit(); + + Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); + InsertPointTy AllocaIP = Builder.saveIP(); + + InsertPointTy EndIP = OMPBuilder.applyStaticWorkshareLoop( + DL, CLI, AllocaIP, /*NeedsBarrier=*/true); + + SmallVector<Value *> DoacrossVars; + Type *I64Ty = Type::getInt64Ty(Ctx); + DoacrossVars.emplace_back(ConstantInt::get(I64Ty, 10)); + DoacrossVars.emplace_back(ConstantInt::get(I64Ty, 52)); + DoacrossVars.emplace_back(ConstantInt::get(I64Ty, 2)); + std::int64_t OrderedVal = 1; + OMPBuilder.applyDoacrossLoop(DL, AllocaIP, Preheader, ExitBlock, OrderedVal, + DoacrossVars); + + auto AllocaIter = BB->begin(); + ASSERT_GE(std::distance(BB->begin(), BB->end()), 5); + AllocaIter++; // PLastIter + AllocaIter++; // PLowerBound + AllocaIter++; // PUpperBound + AllocaIter++; // PStride + AllocaInst *DIMS = dyn_cast<AllocaInst>(&*(AllocaIter)); + EXPECT_NE(DIMS, nullptr); + EXPECT_TRUE(DIMS->getAllocatedType()->isArrayTy()); + EXPECT_EQ(DIMS->getArraySize(), ConstantInt::get(LCTy, 1)); + EXPECT_EQ(DIMS->getAlignment(), 8); + Type *KmpDimTy = DIMS->getAllocatedType()->getArrayElementType(); + EXPECT_TRUE(KmpDimTy->isStructTy()); + EXPECT_EQ(KmpDimTy->getStructNumElements(), 3); + EXPECT_TRUE(KmpDimTy->getStructElementType(0)->isIntegerTy(64)); + EXPECT_TRUE(KmpDimTy->getStructElementType(1)->isIntegerTy(64)); + 
EXPECT_TRUE(KmpDimTy->getStructElementType(2)->isIntegerTy(64)); + + auto PreheaderIter = Preheader->begin(); + ASSERT_GE(std::distance(Preheader->begin(), Preheader->end()), 17); + GetElementPtrInst *ADDR = dyn_cast<GetElementPtrInst>(&*(PreheaderIter++)); + GetElementPtrInst *GEPLB = dyn_cast<GetElementPtrInst>(&*(PreheaderIter++)); + StoreInst *StoreLB = dyn_cast<StoreInst>(&*(PreheaderIter++)); + GetElementPtrInst *GEPUB = dyn_cast<GetElementPtrInst>(&*(PreheaderIter++)); + StoreInst *StoreUB = dyn_cast<StoreInst>(&*(PreheaderIter++)); + GetElementPtrInst *GEPStep = dyn_cast<GetElementPtrInst>(&*(PreheaderIter++)); + StoreInst *StoreStep = dyn_cast<StoreInst>(&*(PreheaderIter++)); + GetElementPtrInst *Base = dyn_cast<GetElementPtrInst>(&*(PreheaderIter++)); + BitCastInst *BaseI8 = dyn_cast<BitCastInst>(&*(PreheaderIter++)); + CallInst *InitGTID = dyn_cast<CallInst>(&*(PreheaderIter++)); + CallInst *DoacrossInit = dyn_cast<CallInst>(&*(PreheaderIter++)); + EXPECT_NE(ADDR, nullptr); + EXPECT_NE(GEPLB, nullptr); + EXPECT_NE(StoreLB, nullptr); + EXPECT_NE(GEPUB, nullptr); + EXPECT_NE(StoreUB, nullptr); + EXPECT_NE(GEPStep, nullptr); + EXPECT_NE(StoreStep, nullptr); + EXPECT_NE(Base, nullptr); + EXPECT_NE(BaseI8, nullptr); + EXPECT_NE(InitGTID, nullptr); + EXPECT_NE(DoacrossInit, nullptr); + EXPECT_EQ(ADDR->getNumOperands(), 3); + EXPECT_EQ(ADDR->getOperand(0), DIMS); + EXPECT_EQ(ADDR->getOperand(1), ConstantInt::get(I64Ty, 0)); + EXPECT_EQ(ADDR->getOperand(2), ConstantInt::get(I64Ty, 0)); + EXPECT_EQ(GEPLB->getNumOperands(), 3); + EXPECT_EQ(GEPLB->getOperand(0), ADDR); + EXPECT_EQ(GEPLB->getOperand(1), ConstantInt::get(LCTy, 0)); + EXPECT_EQ(GEPLB->getOperand(2), ConstantInt::get(LCTy, 0)); + EXPECT_EQ(StoreLB->getNumOperands(), 2); + EXPECT_EQ(StoreLB->getOperand(0), DoacrossVars[0]); + EXPECT_EQ(StoreLB->getOperand(1), GEPLB); + EXPECT_EQ(StoreLB->getAlignment(), 8); + EXPECT_EQ(GEPUB->getNumOperands(), 3); + EXPECT_EQ(GEPUB->getOperand(0), ADDR); + EXPECT_EQ(GEPUB->getOperand(1), ConstantInt::get(LCTy, 0)); + EXPECT_EQ(GEPUB->getOperand(2), ConstantInt::get(LCTy, 1)); + EXPECT_EQ(StoreUB->getNumOperands(), 2); + EXPECT_EQ(StoreUB->getOperand(0), DoacrossVars[1]); + EXPECT_EQ(StoreUB->getOperand(1), GEPUB); + EXPECT_EQ(StoreUB->getAlignment(), 8); + EXPECT_EQ(GEPStep->getNumOperands(), 3); + EXPECT_EQ(GEPStep->getOperand(0), ADDR); + EXPECT_EQ(GEPStep->getOperand(1), ConstantInt::get(LCTy, 0)); + EXPECT_EQ(GEPStep->getOperand(2), ConstantInt::get(LCTy, 2)); + EXPECT_EQ(StoreStep->getNumOperands(), 2); + EXPECT_EQ(StoreStep->getOperand(0), DoacrossVars[2]); + EXPECT_EQ(StoreStep->getOperand(1), GEPStep); + EXPECT_EQ(StoreStep->getAlignment(), 8); + EXPECT_EQ(Base->getNumOperands(), 3); + EXPECT_EQ(Base->getOperand(0), DIMS); + EXPECT_EQ(Base->getOperand(1), ConstantInt::get(I64Ty, 0)); + EXPECT_EQ(Base->getOperand(2), ConstantInt::get(I64Ty, 0)); + EXPECT_EQ(BaseI8->getNumOperands(), 1); + EXPECT_EQ(BaseI8->getOperand(0), Base); + EXPECT_EQ(InitGTID->getCalledFunction()->getName(), + "__kmpc_global_thread_num"); + EXPECT_EQ(DoacrossInit->getCalledFunction()->getName(), + "__kmpc_doacross_init"); + EXPECT_EQ(DoacrossInit->getNumOperands(), 5); + EXPECT_EQ(DoacrossInit->getOperand(2), ConstantInt::get(LCTy, OrderedVal)); + EXPECT_EQ(DoacrossInit->getOperand(3), BaseI8); + + auto ExitIter = ExitBlock->begin(); + ASSERT_GE(std::distance(ExitBlock->begin(), ExitBlock->end()), 2); + ExitIter++; // __kmpc_for_static_fini + CallInst *DoacrossFini = dyn_cast<CallInst>(&*(ExitIter++)); + EXPECT_NE(DoacrossFini, nullptr); + EXPECT_EQ(DoacrossFini->getCalledFunction()->getName(), + "__kmpc_doacross_fini"); + + // Add a termination to our block and check that it is 
internally consistent. + Builder.restoreIP(EndIP); + Builder.CreateRetVoid(); + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); +} + TEST_F(OpenMPIRBuilderTest, MasterDirective) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); @@ -3600,22 +3746,29 @@ EXPECT_EQ(FoundForInit, true); bool FoundForExit = false; - bool FoundBarrier = false; for (Instruction &Inst : *ForExitBB) { if (isa<CallInst>(Inst)) { if (cast<CallInst>(&Inst)->getCalledFunction()->getName() == "__kmpc_for_static_fini") { FoundForExit = true; + break; } + } + } + EXPECT_EQ(FoundForExit, true); + + BasicBlock *ForAfterBB = ForExitBB->getSingleSuccessor(); + EXPECT_NE(ForAfterBB, nullptr); + bool FoundBarrier = false; + for (Instruction &Inst : *ForAfterBB) { + if (isa<CallInst>(Inst)) { if (cast<CallInst>(&Inst)->getCalledFunction()->getName() == "__kmpc_barrier") { FoundBarrier = true; - } - if (FoundForExit && FoundBarrier) break; + } } } - EXPECT_EQ(FoundForExit, true); EXPECT_EQ(FoundBarrier, true); EXPECT_NE(SwitchBB, nullptr); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -273,6 +273,10 @@ The optional `ordered_val` attribute specifies how many loops are associated with the do loop construct. + The `doacross_vars` are the arguments of the doacross loop nest formed by + the "n" outer loops when parameter "n" is specified in the ordered clause. + They store the loop bounds info required by the doacross init runtime call. + The optional `order` attribute specifies which order the iterations of the associate loops are executed in. Currently the only option for this attribute is "concurrent". @@ -295,6 +299,7 @@ Confined<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$collapse_val, UnitAttr:$nowait, Confined<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val, + Variadic<AnyType>:$doacross_vars, OptionalAttr<OrderKind>:$order_val, UnitAttr:$inclusive); @@ -311,8 +316,9 @@ "ValueRange":$linear_step_vars, "ValueRange":$reduction_vars, "StringAttr":$schedule_val, "Value":$schedule_chunk_var, "IntegerAttr":$collapse_val, "UnitAttr":$nowait, - "IntegerAttr":$ordered_val, "StringAttr":$order_val, - "UnitAttr":$inclusive, CArg<"bool", "true">:$buildBody)>, + "IntegerAttr":$ordered_val, "ValueRange":$doacross_vars, + "StringAttr":$order_val, "UnitAttr":$inclusive, + CArg<"bool", "true">:$buildBody)>, OpBuilder<(ins "TypeRange":$resultTypes, "ValueRange":$operands, CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)> ]; diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -536,6 +536,7 @@ collapseClause, orderClause, orderedClause, + doacrossVirtualClause, memoryOrderClause, hintClause, COUNT @@ -608,6 +609,11 @@ SmallVector<Type> linearTypes; SmallVector<OpAsmParser::OperandType> linearSteps; + // "doacross" is not a real clause; it is attached to the "ordered" clause + // when the ordered value is greater than 0. 
+ SmallVector<OpAsmParser::OperandType> doacrossVars; + SmallVector<Type> doacrossTypes; + SmallString<8> schedule; SmallVector<SmallString<12>> modifiers; Optional<OpAsmParser::OperandType> scheduleChunkSize; @@ -742,17 +748,18 @@ result.addAttribute("collapse_val", attr); } else if (clauseKeyword == "ordered") { mlir::IntegerAttr attr; - if (checkAllowed(orderedClause)) + auto type = parser.getBuilder().getI64Type(); + if (checkAllowed(orderedClause) || parser.parseLParen() || + parser.parseAttribute(attr, type) || parser.parseRParen()) return failure(); - if (succeeded(parser.parseOptionalLParen())) { - auto type = parser.getBuilder().getI64Type(); - if (parser.parseAttribute(attr, type) || parser.parseRParen()) + result.addAttribute("ordered_val", attr); + if (attr.getValue().getSExtValue() > 0) { + if (checkAllowed(doacrossVirtualClause) || + parser.parseKeyword("doacross") || + parseOperandAndTypeList(parser, doacrossVars, doacrossTypes)) return failure(); - } else { - // Use 0 to represent no ordered parameter was specified - attr = parser.getBuilder().getI64IntegerAttr(0); + clauseSegments[pos[doacrossVirtualClause]] = doacrossVars.size(); } - result.addAttribute("ordered_val", attr); } else if (clauseKeyword == "order") { StringRef order; if (checkAllowed(orderClause) || parser.parseLParen() || @@ -880,6 +887,13 @@ } } + // Resolve the doacross operands attached to the ordered clause. + if (done[doacrossVirtualClause] && + clauseSegments[pos[doacrossVirtualClause]] && + failed(parser.resolveOperands(doacrossVars, doacrossTypes, + doacrossVars[0].location, result.operands))) + return failure(); + segments.insert(segments.end(), clauseSegments.begin(), clauseSegments.end()); return success(); @@ -1040,9 +1054,9 @@ return failure(); SmallVector<ClauseType> clauses = { - privateClause, firstprivateClause, lastprivateClause, linearClause, - reductionClause, collapseClause, orderClause, orderedClause, - nowaitClause, scheduleClause}; + privateClause, firstprivateClause, lastprivateClause, linearClause, + reductionClause, collapseClause, orderClause, orderedClause, + nowaitClause, scheduleClause, doacrossVirtualClause}; SmallVector<int> segments{numIVs, numIVs, numIVs}; if (failed(parseClauses(parser, result, clauses, segments))) return failure(); @@ -1085,8 +1099,11 @@ if (op.nowait()) p << "nowait "; - if (auto ordered = op.ordered_val()) + if (auto ordered = op.ordered_val()) { p << "ordered(" << ordered << ") "; + if (ordered.getValue() > 0) + printDataVars(p, op.doacross_vars(), "doacross"); + } if (auto order = op.order_val()) p << "order(" << order << ") "; @@ -1190,7 +1207,8 @@ /*linear_vars=*/ValueRange(), /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(), /*schedule_val=*/nullptr, /*schedule_chunk_var=*/nullptr, /*collapse_val=*/nullptr, - /*nowait=*/nullptr, /*ordered_val=*/nullptr, /*order_val=*/nullptr, + /*nowait=*/nullptr, /*ordered_val=*/nullptr, + /*doacross_vars=*/ValueRange(), /*order_val=*/nullptr, /*inclusive=*/nullptr, /*buildBody=*/false); state.addAttributes(attributes); } @@ -1212,8 +1230,8 @@ ValueRange linearStepVars, ValueRange reductionVars, StringAttr scheduleVal, Value scheduleChunkVar, IntegerAttr collapseVal, UnitAttr nowait, - IntegerAttr orderedVal, StringAttr orderVal, - UnitAttr inclusive, bool buildBody) { + IntegerAttr orderedVal, ValueRange doacrossVars, + StringAttr orderVal, UnitAttr inclusive, bool buildBody) { result.addOperands(lowerBounds); result.addOperands(upperBounds); result.addOperands(steps); @@ -1223,6 +1241,7 @@ result.addOperands(linearStepVars); if (scheduleChunkVar) result.addOperands(scheduleChunkVar); + 
result.addOperands(doacrossVars); if (scheduleVal) result.addAttribute("schedule_val", scheduleVal); @@ -1248,7 +1267,8 @@ static_cast<int32_t>(linearVars.size()), static_cast<int32_t>(linearStepVars.size()), static_cast<int32_t>(reductionVars.size()), - static_cast<int32_t>(scheduleChunkVar != nullptr ? 1 : 0)})); + static_cast<int32_t>(scheduleChunkVar != nullptr ? 1 : 0), + static_cast<int32_t>(doacrossVars.size())})); Region *bodyRegion = result.addRegion(); if (buildBody) { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -749,9 +749,9 @@ return failure(); } - // Collapse loops. Store the insertion point because LoopInfos may get + // Collapse loops. Store the basic block because LoopInfos may get // invalidated. - llvm::IRBuilderBase::InsertPoint afterIP = loopInfos.front()->getAfterIP(); + llvm::BasicBlock *afterBB = loopInfos.front()->getAfter(); llvm::CanonicalLoopInfo *loopInfo = ompBuilder->collapseLoops(diLoc, loopInfos, {}); @@ -759,6 +759,12 @@ bool isSimd = loop.simd_modifier(); + // Store the BBs since loopInfo gets invalidated after apply*WorkshareLoop. + llvm::BasicBlock *preHeaderBB = loopInfo->getPreheader(); + llvm::BasicBlock *exitBB = loopInfo->getExit(); + + std::int64_t orderedVal = + loop.ordered_val().hasValue() ? loop.ordered_val().getValue() : -1; if (schedule == omp::ClauseScheduleKind::Static) { ompBuilder->applyStaticWorkshareLoop(ompLoc.DL, loopInfo, allocaIP, !loop.nowait(), chunk); @@ -803,15 +809,23 @@ break; } } - afterIP = ompBuilder->applyDynamicWorkshareLoop( - ompLoc.DL, loopInfo, allocaIP, schedType, !loop.nowait(), chunk); + ompBuilder->applyDynamicWorkshareLoop(ompLoc.DL, loopInfo, allocaIP, + schedType, !loop.nowait(), chunk); + } + + if (orderedVal > 0) { + SmallVector<llvm::Value *> doacrossVars = + moduleTranslation.lookupValues(loop.doacross_vars()); + ompBuilder->applyDoacrossLoop(ompLoc.DL, allocaIP, preHeaderBB, exitBB, + orderedVal, doacrossVars); + } // Continue building IR after the loop. Note that the LoopInfo returned by // `collapseLoops` points inside the outermost loop and is intended for - // potential further loop transformations. Use the insertion point stored - // before collapsing loops instead. - builder.restoreIP(afterIP); + // potential further loop transformations. Use the after basic block stored + // before collapsing loops instead, appending newly created instructions + // to it. + builder.SetInsertPoint(afterBB, afterBB->end()); // Process the reductions if required. 
if (numReductions == 0) diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -57,7 +57,7 @@ // CHECK: "test.payload"(%[[CAST_ARG6]], %[[CAST_ARG7]]) : (index, index) -> () "test.payload"(%arg6, %arg7) : (index, index) -> () omp.yield - }) {operand_segment_sizes = dense<[2, 2, 2, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (index, index, index, index, index, index) -> () + }) {operand_segment_sizes = dense<[2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0]> : vector<11xi32>} : (index, index, index, index, index, index) -> () omp.terminator } return diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -120,7 +120,7 @@ func @ordered_not_allowed() { // expected-error@+1 {{ordered is not a valid clause for the omp.parallel operation}} - omp.parallel ordered(2) {} + omp.parallel ordered(0) {} } // ----- @@ -448,8 +448,8 @@ // ----- -func @omp_ordered1(%arg1 : i32, %arg2 : i32, %arg3 : i32) -> () { - omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) { +func @omp_ordered1(%arg1 : i32, %arg2 : i32, %arg3 : i32, %doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64) -> () { + omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) doacross(%doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64) { // expected-error @below {{ordered region must be closely nested inside a worksharing-loop region with an ordered clause without parameter present}} omp.ordered_region { omp.terminator @@ -493,8 +493,8 @@ } // ----- -func @omp_ordered5(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64, %vec1 : i64) -> () { - omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) { +func @omp_ordered5(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64, %vec1 : i64, %doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64) -> () { + omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) doacross(%doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64) { // expected-error @below {{number of variables in depend clause does not match number of iteration variables in the doacross loop}} omp.ordered depend_type("dependsource") depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64} @@ -794,7 +794,7 @@ func @omp_sections() { // expected-error @below {{ordered is not a valid clause for the omp.sections operation}} - omp.sections ordered(2) { + omp.sections ordered(0) { omp.terminator } return diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -147,52 +147,51 @@ } // CHECK-LABEL: omp_wsloop -func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memref<i32>, %linear_var : i32, %chunk_var : i32) -> () { +func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memref<i32>, %linear_var : i32, %chunk_var : i32, %doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64, %doacross_var4 : i64, %doacross_var5 : i64, %doacross_var6 : i64) -> () { - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>, %{{.*}} : memref<i32>) collapse(2) ordered(1) - "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) 
to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>, %{{.*}} : memref<i32>) collapse(2) ordered(1) doacross(%{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64) + "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %doacross_var1, %doacross_var2, %doacross_var3) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,2,0,0,0,0,0,0]> : vector<10xi32>, collapse_val = 2, ordered_val = 1} : - (index, index, index, memref<i32>, memref<i32>) -> () + }) {operand_segment_sizes = dense<[1,1,1,2,0,0,0,0,0,0,3]> : vector<11xi32>, collapse_val = 2, ordered_val = 1} : + (index, index, index, memref<i32>, memref<i32>, i64, i64, i64) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(static) "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,0,0,0,1,1,0,0]> : vector<10xi32>, schedule_val = "Static"} : + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,1,1,0,0,0]> : vector<11xi32>, schedule_val = "Static"} : (index, index, index, memref<i32>, i32) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref<i32>, %{{.*}} = %{{.*}} : memref<i32>) schedule(static) "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %linear_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,0,0,0,2,2,0,0]> : vector<10xi32>, schedule_val = "Static"} : + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,2,2,0,0,0]> : vector<11xi32>, schedule_val = "Static"} : (index, index, index, memref<i32>, memref<i32>, i32, i32) -> () - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}) collapse(3) ordered(2) - "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %data_var, %data_var, %linear_var, %chunk_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}) collapse(3) ordered(2) doacross(%{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64) + "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %data_var, %data_var, %linear_var, %chunk_var, %doacross_var1, %doacross_var2, %doacross_var3, %doacross_var4, %doacross_var5, %doacross_var6) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1,0,1]> : vector<10xi32>, schedule_val = "Dynamic", collapse_val = 3, ordered_val = 2} : - (index, index, index, memref<i32>, memref<i32>, memref<i32>, memref<i32>, i32, i32) -> () + }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1,0,1,6]> : vector<11xi32>, schedule_val = "Dynamic", collapse_val = 3, ordered_val = 2} : + (index, index, index, memref<i32>, memref<i32>, memref<i32>, memref<i32>, i32, i32, i64, i64, i64, i64, i64, i64) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) schedule(auto) nowait "omp.wsloop" (%lb, %ub, %step, %data_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0,0]> : vector<10xi32>, nowait, schedule_val = "Auto"} : + }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0,0,0]> : vector<11xi32>, nowait, schedule_val = "Auto"} : (index, index, index, memref<i32>) -> () return } // CHECK-LABEL: omp_wsloop_pretty -func @omp_wsloop_pretty(%lb : 
index, %ub : index, %step : index, - %data_var : memref<i32>, %linear_var : i32, %chunk_var : i32) -> () { +func @omp_wsloop_pretty(%lb : index, %ub : index, %step : index, %data_var : memref<i32>, %linear_var : i32, %chunk_var : i32, %doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64, %doacross_var4 : i64, %doacross_var5 : i64, %doacross_var6 : i64) -> () { // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : memref<i32>) collapse(2) ordered(2) { + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : memref<i32>) collapse(2) { omp.yield } @@ -201,22 +200,22 @@ omp.yield } - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(static = %{{.*}}) collapse(3) ordered(2) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) private(%data_var : memref<i32>) + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(static = %{{.*}}) collapse(3) ordered(2) doacross(%{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) doacross(%doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64, %doacross_var4 : i64, %doacross_var5 : i64, %doacross_var6 : i64) private(%data_var : memref<i32>) firstprivate(%data_var : memref<i32>) lastprivate(%data_var : memref<i32>) linear(%data_var = %linear_var : memref<i32>) schedule(static = %chunk_var) collapse(3) { omp.yield } - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}, nonmonotonic) collapse(3) ordered(2) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) private(%data_var : memref<i32>) + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}, nonmonotonic) collapse(3) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : memref<i32>) firstprivate(%data_var : memref<i32>) lastprivate(%data_var : memref<i32>) linear(%data_var = %linear_var : memref<i32>) schedule(dynamic = %chunk_var, nonmonotonic) collapse(3) { omp.yield } - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}, monotonic) collapse(3) ordered(2) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) private(%data_var : memref<i32>) + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref<i32>) firstprivate(%{{.*}} : memref<i32>) lastprivate(%{{.*}} : memref<i32>) linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}, monotonic) collapse(3) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : memref<i32>) firstprivate(%data_var : memref<i32>) lastprivate(%data_var : memref<i32>) linear(%data_var = %linear_var : memref<i32>) schedule(dynamic = %chunk_var, 
monotonic) collapse(3) { omp.yield @@ -450,8 +449,7 @@ return } -func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32, - %vec0 : i64, %vec1 : i64, %vec2 : i64, %vec3 : i64) -> () { +func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64, %vec1 : i64, %vec2 : i64, %vec3 : i64, %doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64, %doacross_var4 : i64, %doacross_var5 : i64, %doacross_var6 : i64) -> () { // CHECK: omp.ordered_region omp.ordered_region { // CHECK: omp.terminator @@ -465,7 +463,7 @@ omp.yield } - omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) { + omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(1) doacross(%doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64) { // Only one DEPEND(SINK: vec) clause // CHECK: omp.ordered depend_type("dependsink") depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64} omp.ordered depend_type("dependsink") depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} @@ -476,7 +474,7 @@ omp.yield } - omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(2) { + omp.wsloop (%0) : i32 = (%arg1) to (%arg2) step (%arg3) ordered(2) doacross(%doacross_var1 : i64, %doacross_var2 : i64, %doacross_var3 : i64, %doacross_var4 : i64, %doacross_var5 : i64, %doacross_var6 : i64) { // Multiple DEPEND(SINK: vec) clauses // CHECK: omp.ordered depend_type("dependsink") depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {num_loops_val = 2 : i64} omp.ordered depend_type("dependsink") depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {num_loops_val = 2 : i64} diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -379,7 +379,7 @@ llvm.store %3, %4 : !llvm.ptr<f32> omp.yield // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @[[$wsloop_loc_struct]], - }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (i64, i64, i64) -> () + }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]> : vector<11xi32>} : (i64, i64, i64) -> () omp.terminator } llvm.return @@ -399,7 +399,7 @@ %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> llvm.store %3, %4 : !llvm.ptr<f32> omp.yield - }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (i64, i64, i64) -> () + }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]> : vector<11xi32>} : (i64, i64, i64) -> () llvm.return } @@ -417,7 +417,7 @@ %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> llvm.store %3, %4 : !llvm.ptr<f32> omp.yield - }) {inclusive, operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (i64, i64, i64) -> () + }) {inclusive, operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]> : vector<11xi32>} : (i64, i64, i64) -> () llvm.return } @@ -631,9 +631,103 @@ // ----- -// CHECK-LABEL: @omp_ordered -llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64, - %arg4: i64, %arg5: i64, %arg6: i64) -> () { +// Check that the loop bounds are emitted in the correct location in case of +// collapse for dynamic schedule. This only checks the overall shape of the IR; +// detailed checking is done by the OpenMPIRBuilder. 
+ +// CHECK-LABEL: @collapse_wsloop_dynamic +// CHECK: i32* noalias %[[TIDADDR:[0-9A-Za-z.]*]] +// CHECK: load i32, i32* %[[TIDADDR]] +// CHECK: store +// CHECK: load +// CHECK: %[[LB0:.*]] = load i32 +// CHECK: %[[UB0:.*]] = load i32 +// CHECK: %[[STEP0:.*]] = load i32 +// CHECK: %[[LB1:.*]] = load i32 +// CHECK: %[[UB1:.*]] = load i32 +// CHECK: %[[STEP1:.*]] = load i32 +// CHECK: %[[LB2:.*]] = load i32 +// CHECK: %[[UB2:.*]] = load i32 +// CHECK: %[[STEP2:.*]] = load i32 + +llvm.func @collapse_wsloop_dynamic( + %0: i32, %1: i32, %2: i32, + %3: i32, %4: i32, %5: i32, + %6: i32, %7: i32, %8: i32, + %20: !llvm.ptr<i32>) { + omp.parallel { + // CHECK: icmp slt i32 %[[LB0]], 0 + // CHECK-COUNT-4: select + // CHECK: %[[TRIPCOUNT0:.*]] = select + // CHECK: br label %[[PREHEADER:.*]] + // + // CHECK: [[PREHEADER]]: + // CHECK: icmp slt i32 %[[LB1]], 0 + // CHECK-COUNT-4: select + // CHECK: %[[TRIPCOUNT1:.*]] = select + // CHECK: icmp slt i32 %[[LB2]], 0 + // CHECK-COUNT-4: select + // CHECK: %[[TRIPCOUNT2:.*]] = select + // CHECK: %[[PROD:.*]] = mul nuw i32 %[[TRIPCOUNT0]], %[[TRIPCOUNT1]] + // CHECK: %[[TOTAL:.*]] = mul nuw i32 %[[PROD]], %[[TRIPCOUNT2]] + // CHECK: br label %[[COLLAPSED_PREHEADER:.*]] + // + // CHECK: [[COLLAPSED_PREHEADER]]: + // CHECK: store i32 1, i32* + // CHECK: store i32 %[[TOTAL]], i32* + // CHECK: call void @__kmpc_dispatch_init_4u + omp.wsloop (%arg0, %arg1, %arg2) : i32 = (%0, %1, %2) to (%3, %4, %5) step (%6, %7, %8) collapse(3) schedule(dynamic) { + %31 = llvm.load %20 : !llvm.ptr<i32> + %32 = llvm.add %31, %arg0 : i32 + %33 = llvm.add %32, %arg1 : i32 + %34 = llvm.add %33, %arg2 : i32 + llvm.store %34, %20 : !llvm.ptr<i32> + omp.yield + } + omp.terminator + } + llvm.return +} + +// ----- + +// CHECK-LABEL: @omp_ordered_clause_para +llvm.func @omp_ordered_clause_para(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64, + %arg4: i64, %arg5: i64, %arg6: i64, %arg7 : i64, %arg8 : i64) -> () { + // CHECK: [[DIMS:%.*]] = alloca [2 x [[KMPDIM:%.*]]], align 8 + omp.wsloop (%arg) : i32 = (%arg0) to (%arg1) step (%arg2) ordered(2) doacross(%arg3 : i64, %arg4 : i64, %arg5 : i64, %arg6 : i64, %arg7 : i64, %arg8 : i64) { + // CHECK: omp_loop.preheader: + // CHECK: [[ADDR0:%.*]] = getelementptr inbounds [2 x [[KMPDIM]]], [2 x [[KMPDIM]]]* [[DIMS]], i64 0, i64 0 + // CHECK: [[LB0:%.*]] = getelementptr inbounds [[KMPDIM]], [[KMPDIM]]* [[ADDR0]], i32 0, i32 0 + // CHECK: store i64 [[ARG3:%.*]], i64* [[LB0]], align 8 + // CHECK: [[UB0:%.*]] = getelementptr inbounds [[KMPDIM]], [[KMPDIM]]* [[ADDR0]], i32 0, i32 1 + // CHECK: store i64 [[ARG4:%.*]], i64* [[UB0]], align 8 + // CHECK: [[STEP0:%.*]] = getelementptr inbounds [[KMPDIM]], [[KMPDIM]]* [[ADDR0]], i32 0, i32 2 + // CHECK: store i64 [[ARG5:%.*]], i64* [[STEP0]], align 8 + // CHECK: [[ADDR1:%.*]] = getelementptr inbounds [2 x [[KMPDIM]]], [2 x [[KMPDIM]]]* [[DIMS]], i64 0, i64 1 + // CHECK: [[LB1:%.*]] = getelementptr inbounds [[KMPDIM]], [[KMPDIM]]* [[ADDR1]], i32 0, i32 0 + // CHECK: store i64 [[ARG6:%.*]], i64* [[LB1]], align 8 + // CHECK: [[UB1:%.*]] = getelementptr inbounds [[KMPDIM]], [[KMPDIM]]* [[ADDR1]], i32 0, i32 1 + // CHECK: store i64 [[ARG7:%.*]], i64* [[UB1]], align 8 + // CHECK: [[STEP1:%.*]] = getelementptr inbounds [[KMPDIM]], [[KMPDIM]]* [[ADDR1]], i32 0, i32 2 + // CHECK: store i64 [[ARG8:%.*]], i64* [[STEP1]], align 8 + // CHECK: [[BASE:%.*]] = getelementptr inbounds [2 x [[KMPDIM]]], [2 x [[KMPDIM]]]* [[DIMS]], i64 0, i64 0 + // CHECK: [[BASEI8:%.*]] = bitcast [[KMPDIM]]* [[BASE]] to i8* + // CHECK: 
[[OMP_THREAD:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) + // CHECK: call void @__kmpc_doacross_init(%struct.ident_t* @[[GLOB1]], i32 [[OMP_THREAD]], i32 2, i8* [[BASEI8]]) + // CHECK: omp_loop.exit: + // CHECK: call void @__kmpc_doacross_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_THREAD]]) + omp.yield + } + + llvm.return +} +// ----- + +// CHECK-LABEL: @omp_ordered_construct +llvm.func @omp_ordered_construct(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64, + %arg4: i64, %arg5: i64, %arg6: i64, %arg8 : i64, %arg9 : i64, %arg10 : i64, + %arg11 : i64, %arg12 : i64, %arg13 : i64) -> () { // CHECK: [[ADDR9:%.*]] = alloca [2 x i64], align 8 // CHECK: [[ADDR7:%.*]] = alloca [2 x i64], align 8 // CHECK: [[ADDR5:%.*]] = alloca [2 x i64], align 8 @@ -657,7 +751,7 @@ omp.yield } - omp.wsloop (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) ordered(1) { + omp.wsloop (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) ordered(1) doacross(%arg8 : i64, %arg9 : i64, %arg10 : i64) { // CHECK: [[TMP:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[ADDR]], i64 0, i64 0 // CHECK: store i64 [[ARG0:%.*]], i64* [[TMP]], align 4 // CHECK: [[TMP2:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[ADDR]], i64 0, i64 0 @@ -675,7 +769,7 @@ omp.yield } - omp.wsloop (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) ordered(2) { + omp.wsloop (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) ordered(2) doacross(%arg8 : i64, %arg9 : i64, %arg10 : i64, %arg11 : i64, %arg12 : i64, %arg13 : i64) { // CHECK: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[ADDR5]], i64 0, i64 0 // CHECK: store i64 [[ARG0]], i64* [[TMP5]], align 4 // CHECK: [[TMP6:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[ADDR5]], i64 0, i64 1 @@ -779,10 +873,10 @@ // CHECK: [[EXIT]]: // CHECK: call void @__kmpc_for_static_fini({{.*}}) - // CHECK: call void @__kmpc_barrier({{.*}}) // CHECK: br label %[[AFTER:.*]] // CHECK: [[AFTER]]: + // CHECK: call void @__kmpc_barrier({{.*}}) // CHECK: br label %[[END:.*]] // CHECK: [[END]]:
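
For reviewers, a minimal sketch of how a frontend could drive the new OpenMPIRBuilder API end to end, modeled directly on the DoacrossLoop unit test above. The wrapper function buildDoacrossWsLoop, its parameters, and the constant loop bounds are illustrative assumptions, not part of the patch; only the OpenMPIRBuilder calls mirror the signatures introduced here (assumes the LLVM headers are included, using namespace llvm, and OMPBuilder.initialize() has been called):

// Hypothetical driver, not part of the patch: lower a canonical loop to a
// static workshare loop, then attach the doacross init/fini runtime calls
// for ordered(1).
static void buildDoacrossWsLoop(OpenMPIRBuilder &OMPBuilder,
                                IRBuilder<> &Builder, BasicBlock *EntryBB,
                                DebugLoc DL) {
  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
  LLVMContext &Ctx = EntryBB->getContext();

  // Canonical loop from 10 to 52 (exclusive) in steps of 2, as in the test.
  Type *LCTy = Type::getInt32Ty(Ctx);
  Value *Start = ConstantInt::get(LCTy, 10);
  Value *Stop = ConstantInt::get(LCTy, 52);
  Value *Step = ConstantInt::get(LCTy, 2);
  auto BodyGen = [](InsertPointTy, Value *) { /* emit loop body here */ };
  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
  CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
      Loc, BodyGen, Start, Stop, Step, /*IsSigned=*/false,
      /*InclusiveStop=*/false);

  // Capture the blocks now; applyStaticWorkshareLoop invalidates the CLI.
  BasicBlock *Preheader = CLI->getPreheader();
  BasicBlock *Exit = CLI->getExit();

  Builder.SetInsertPoint(EntryBB, EntryBB->getFirstInsertionPt());
  InsertPointTy AllocaIP = Builder.saveIP();
  OMPBuilder.applyStaticWorkshareLoop(DL, CLI, AllocaIP,
                                      /*NeedsBarrier=*/true);

  // One (lower bound, upper bound, step) triple per ordered loop; the
  // doacross init runtime call requires i64 bounds.
  Type *I64Ty = Type::getInt64Ty(Ctx);
  SmallVector<Value *> DoacrossVars = {ConstantInt::get(I64Ty, 10),
                                       ConstantInt::get(I64Ty, 52),
                                       ConstantInt::get(I64Ty, 2)};
  OMPBuilder.applyDoacrossLoop(DL, AllocaIP, Preheader, Exit,
                               /*OrderedVal=*/1, DoacrossVars);
}

This emits the __kmpc_doacross_init call at the front of the loop preheader and the __kmpc_doacross_fini call in the exit block, matching the CHECK lines in @omp_ordered_clause_para above.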