Index: lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- lib/CodeGen/CGOpenMPRuntime.h +++ lib/CodeGen/CGOpenMPRuntime.h @@ -44,6 +44,34 @@ class CodeGenModule; class CGOpenMPRuntime { +public: + /// \brief Iterator through codegen kinds for combined directives. + typedef ArrayRef::iterator combined_iterator; + /// \brief Iterator range for codegen kinds for combined directives. + class CombinedCodeGenKind { + combined_iterator begin_iterator, end_iterator; + + public: + explicit CombinedCodeGenKind() + : begin_iterator(nullptr), end_iterator(nullptr) {} + CombinedCodeGenKind(combined_iterator begin_iterator, + combined_iterator end_iterator) + : begin_iterator(std::move(begin_iterator)), + end_iterator(std::move(end_iterator)) {} + /// \brief Advance current codegen kind for the next codegen kind in the + /// list. + void next() { + assert(begin_iterator != end_iterator); + ++begin_iterator; + } + OpenMPDirectiveKind operator*() const { + assert(begin_iterator != end_iterator); + return *begin_iterator; + } + explicit operator bool() const { return begin_iterator != end_iterator; } + }; + +private: enum OpenMPRTLFunction { /// \brief Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, /// kmpc_micro microtask, ...); @@ -284,14 +312,20 @@ virtual ~CGOpenMPRuntime() {} virtual void clear(); - /// \brief Emits outlined function for the specified OpenMP directive \a D. - /// This outlined function has type void(*)(kmp_int32 *ThreadID, kmp_int32 - /// BoundID, struct context_vars*). + /// \brief Emits outlined function for the specified OpenMP parallel directive + /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID, + /// kmp_int32 BoundID, struct context_vars*). /// \param D OpenMP directive. /// \param ThreadIDVar Variable for thread id in the current OpenMP region. - /// - virtual llvm::Value *emitOutlinedFunction(const OMPExecutableDirective &D, - const VarDecl *ThreadIDVar); + /// \param CodeGenKind Points to current codegen kind in a list of + /// types of codegen kinds that must be used for combined directives. For + /// example, if this list includes OMPD_parallel and OMPD_for kinds, then the + /// outlined function for 'parallel' region must be generated that includes + /// the code for 'for' region, generated for the specified directive \a D. + virtual llvm::Value * + emitParallelOutlinedFunction(const OMPExecutableDirective &D, + const VarDecl *ThreadIDVar, + CombinedCodeGenKind CodeGenKind); /// \brief Emits outlined function for the OpenMP task directive \a D. This /// outlined function has type void(*)(kmp_int32 ThreadID, kmp_int32 @@ -517,6 +551,29 @@ const OMPExecutableDirective &D); ~InlinedOpenMPRegionRAII(); }; + +/// \brief RAII for emitting code of combined OpenMP constructs like '#pragma +/// omp parallel for' or '#pragma omp parallel sections' that implicitly include +/// several OpenMP regions. For example, 'parallel for' directive implicitly +/// includes 'parallel' region with inner 'for' region. +class CombinedOpenMPRegionRAII { + CodeGenFunction &CGF; + bool IsOuterCombinedRegion; + +public: + /// \brief Constructs region for combined constructs. + /// \param CodeGenKind Points to current codegen kind in a list of + /// types of codegen kinds that must be used for combined directives. For + /// example, if this list includes OMPD_parallel and OMPD_for kinds, then the + /// outlined function for 'parallel' region must be generated that includes + /// the code for 'for' region, generated for the specified directive \a D. + CombinedOpenMPRegionRAII(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + CGOpenMPRuntime::CombinedCodeGenKind CodeGenKind); + /// \brief Return current codegen kind for combined construct. + CGOpenMPRuntime::CombinedCodeGenKind getCurrentCodeGenKind() const; + ~CombinedOpenMPRegionRAII(); +}; } // namespace CodeGen } // namespace clang Index: lib/CodeGen/CGOpenMPRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntime.cpp +++ lib/CodeGen/CGOpenMPRuntime.cpp @@ -31,11 +31,29 @@ /// \brief Base class for handling code generation inside OpenMP regions. class CGOpenMPRegionInfo : public CodeGenFunction::CGCapturedStmtInfo { public: - CGOpenMPRegionInfo(const OMPExecutableDirective &D, const CapturedStmt &CS) - : CGCapturedStmtInfo(CS, CR_OpenMP), Directive(D) {} + /// \brief Kinds of OpenMP regions used in codegen. + enum CGOpenMPRegionKind { + /// \brief Region with outlined function for standalone 'parallel' + /// directive. + ParallelOutlinedRegion, + /// \brief Region with outlined function for standalone 'task' directive. + TaskOutlinedRegion, + /// \brief Region for constructs that do not require function outlining, + /// like 'for', 'sections', 'atomic' etc. directives. + InlinedRegion, + /// \brief Region for combined directives, like 'parallel for' or 'parallel + /// sections'. + CombinedRegion, + }; - CGOpenMPRegionInfo(const OMPExecutableDirective &D) - : CGCapturedStmtInfo(CR_OpenMP), Directive(D) {} + CGOpenMPRegionInfo(const OMPExecutableDirective &D, const CapturedStmt &CS, + const CGOpenMPRegionKind RegionKind) + : CGCapturedStmtInfo(CS, CR_OpenMP), Directive(D), + RegionKind(RegionKind) {} + + CGOpenMPRegionInfo(const OMPExecutableDirective &D, + const CGOpenMPRegionKind RegionKind) + : CGCapturedStmtInfo(CR_OpenMP), Directive(D), RegionKind(RegionKind) {} /// \brief Get a variable or parameter for storing global thread id /// inside OpenMP construct. @@ -48,12 +66,15 @@ /// \brief Emit the captured statement body. virtual void EmitBody(CodeGenFunction &CGF, const Stmt *S) override; + CGOpenMPRegionKind getRegionKind() const { return RegionKind; } + static bool classof(const CGCapturedStmtInfo *Info) { return Info->getKind() == CR_OpenMP; } protected: /// \brief OpenMP executable directive associated with the region. const OMPExecutableDirective &Directive; + CGOpenMPRegionKind RegionKind; }; /// \brief API for captured statement code generation in OpenMP constructs. @@ -61,7 +82,8 @@ public: CGOpenMPOutlinedRegionInfo(const OMPExecutableDirective &D, const CapturedStmt &CS, const VarDecl *ThreadIDVar) - : CGOpenMPRegionInfo(D, CS), ThreadIDVar(ThreadIDVar) { + : CGOpenMPRegionInfo(D, CS, ParallelOutlinedRegion), + ThreadIDVar(ThreadIDVar) { assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region."); } /// \brief Get a variable or parameter for storing global thread id @@ -69,9 +91,19 @@ virtual const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; } + + /// \brief Emit the captured statement body. + virtual void EmitBody(CodeGenFunction &CGF, const Stmt *S) override; + /// \brief Get the name of the capture helper. StringRef getHelperName() const override { return ".omp_outlined."; } + static bool classof(const CGCapturedStmtInfo *Info) { + return CGOpenMPRegionInfo::classof(Info) && + cast(Info)->getRegionKind() == + ParallelOutlinedRegion; + } + private: /// \brief A variable or parameter storing global thread id for OpenMP /// constructs. @@ -85,7 +117,7 @@ const CapturedStmt &CS, const VarDecl *ThreadIDVar, const VarDecl *PartIDVar) - : CGOpenMPRegionInfo(D, CS), ThreadIDVar(ThreadIDVar), + : CGOpenMPRegionInfo(D, CS, TaskOutlinedRegion), ThreadIDVar(ThreadIDVar), PartIDVar(PartIDVar) { assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region."); } @@ -104,6 +136,12 @@ /// \brief Get the name of the capture helper. StringRef getHelperName() const override { return ".omp_outlined."; } + static bool classof(const CGCapturedStmtInfo *Info) { + return CGOpenMPRegionInfo::classof(Info) && + cast(Info)->getRegionKind() == + TaskOutlinedRegion; + } + private: /// \brief A variable or parameter storing global thread id for OpenMP /// constructs. @@ -119,7 +157,7 @@ public: CGOpenMPInlinedRegionInfo(const OMPExecutableDirective &D, CodeGenFunction::CGCapturedStmtInfo *OldCSI) - : CGOpenMPRegionInfo(D), OldCSI(OldCSI), + : CGOpenMPRegionInfo(D, InlinedRegion), OldCSI(OldCSI), OuterRegionInfo(dyn_cast_or_null(OldCSI)) {} // \brief Retrieve the value of the context parameter. virtual llvm::Value *getContextValue() const override { @@ -127,6 +165,13 @@ return OuterRegionInfo->getContextValue(); llvm_unreachable("No context value for inlined OpenMP region"); } + virtual void setContextValue(llvm::Value *V) override { + if (OuterRegionInfo) { + OuterRegionInfo->setContextValue(V); + return; + } + llvm_unreachable("No context value for inlined OpenMP region"); + } /// \brief Lookup the captured field decl for a variable. virtual const FieldDecl *lookup(const VarDecl *VD) const override { if (OuterRegionInfo) @@ -154,11 +199,63 @@ CodeGenFunction::CGCapturedStmtInfo *getOldCSI() const { return OldCSI; } + static bool classof(const CGCapturedStmtInfo *Info) { + return CGOpenMPRegionInfo::classof(Info) && + cast(Info)->getRegionKind() == InlinedRegion; + } + private: /// \brief CodeGen info about outer OpenMP region. CodeGenFunction::CGCapturedStmtInfo *OldCSI; CGOpenMPRegionInfo *OuterRegionInfo; }; + +/// \brief API for code generation of combined OpenMP constructs. +class CGOpenMPCombinedRegionInfo : public CGOpenMPInlinedRegionInfo { +public: + CGOpenMPCombinedRegionInfo(const OMPExecutableDirective &D, + CodeGenFunction::CGCapturedStmtInfo *OldCSI, + CGOpenMPRuntime::CombinedCodeGenKind CodeGenKind) + : CGOpenMPInlinedRegionInfo(D, OldCSI), CodeGenKind(CodeGenKind) { + RegionKind = CombinedRegion; + } + /// \brief Make current combined region to emit code for the next OpenMP + /// region. + void nextCodeGenKind(); + /// \brief Get current codegen kind for the combined region. + CGOpenMPRuntime::CombinedCodeGenKind getCurrentCodeGenKind() const { + return CodeGenKind; + } + /// \brief Check if the current combined region is used for codegen of the \a + /// D directive. + bool isRegionForDirective(const OMPExecutableDirective &D) const { + return &D == &Directive; + } + /// \brief Emit the captured statement body for combined directives. + virtual void EmitBody(CodeGenFunction &CGF, const Stmt *) override { + // If current codegen kind points to some specific kind - use code emission + // for the directive again to emit the code for the remaining parts of the + // combined construct. + assert (*getCurrentCodeGenKind() != OMPD_unknown); + CGF.EmitStmt(&Directive); + } + + /// \brief Get the name of the capture helper. + virtual StringRef getHelperName() const override { + if (auto *OuterRegionInfo = getOldCSI()) + return OuterRegionInfo->getHelperName(); + llvm_unreachable("No helper name for combined construct"); + } + + static bool classof(const CGCapturedStmtInfo *Info) { + return CGOpenMPRegionInfo::classof(Info) && + cast(Info)->getRegionKind() == CombinedRegion; + } + +private: + /// \brief Current codegen kind. + CGOpenMPRuntime::CombinedCodeGenKind CodeGenKind; +}; } // namespace LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) { @@ -183,6 +280,13 @@ CGCapturedStmtInfo::EmitBody(CGF, S); } +void CGOpenMPOutlinedRegionInfo::EmitBody(CodeGenFunction &CGF, const Stmt *S) { + CGOpenMPRegionInfo::EmitBody(CGF, S); + // Emit implicit barrier at the end of parallel region. + CGF.CGM.getOpenMPRuntime().emitBarrierCall(CGF, Directive.getLocStart(), + OMPD_parallel); +} + LValue CGOpenMPTaskOutlinedRegionInfo::getThreadIDVariableLValue( CodeGenFunction &CGF) { return CGF.MakeNaturalAlignAddrLValue( @@ -198,6 +302,10 @@ CGCapturedStmtInfo::EmitBody(CGF, S); } +void CGOpenMPCombinedRegionInfo::nextCodeGenKind() { + CodeGenKind.next(); +} + CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) : CGM(CGM), DefaultOpenMPPSource(nullptr), KmpRoutineEntryPtrTy(nullptr) { IdentTy = llvm::StructType::create( @@ -216,14 +324,21 @@ } llvm::Value * -CGOpenMPRuntime::emitOutlinedFunction(const OMPExecutableDirective &D, - const VarDecl *ThreadIDVar) { +CGOpenMPRuntime::emitParallelOutlinedFunction(const OMPExecutableDirective &D, + const VarDecl *ThreadIDVar, + CombinedCodeGenKind CodeGenKind) { assert(ThreadIDVar->getType()->isPointerType() && "thread id variable must be of type kmp_int32 *"); const CapturedStmt *CS = cast(D.getAssociatedStmt()); CodeGenFunction CGF(CGM, true); CGOpenMPOutlinedRegionInfo CGInfo(D, *CS, ThreadIDVar); CGF.CapturedStmtInfo = &CGInfo; + if (CodeGenKind) { + // We're emitting combined region with implicit 'parallel' region. + assert(*CodeGenKind == OMPD_parallel); + CombinedOpenMPRegionRAII Region(CGF, D, CodeGenKind); + return CGF.GenerateCapturedStmtFunction(*CS); + } return CGF.GenerateCapturedStmtFunction(*CS); } @@ -1536,3 +1651,40 @@ CGF.CapturedStmtInfo = OldCSI; } +CombinedOpenMPRegionRAII::CombinedOpenMPRegionRAII( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + CGOpenMPRuntime::CombinedCodeGenKind CodeGenKind) + : CGF(CGF), IsOuterCombinedRegion(false) { + assert(CodeGenKind); + if (auto *CRI = + dyn_cast_or_null(CGF.CapturedStmtInfo)) { + if (CRI->isRegionForDirective(D)) { + // We're already emitting code for the specified combined construct - + // advance to the next codegen kind. + CRI->nextCodeGenKind(); + return; + } + } + // Start emission for the combined construct. + CGF.CapturedStmtInfo = + new CGOpenMPCombinedRegionInfo(D, CGF.CapturedStmtInfo, CodeGenKind); + IsOuterCombinedRegion = true; +} + +CGOpenMPRuntime::CombinedCodeGenKind +CombinedOpenMPRegionRAII::getCurrentCodeGenKind() const { + return cast(CGF.CapturedStmtInfo) + ->getCurrentCodeGenKind(); +} + +CombinedOpenMPRegionRAII::~CombinedOpenMPRegionRAII() { + // Restore original CapturedStmtInfo only if we're done with code emission for + // the combined directive. + if (IsOuterCombinedRegion) { + auto *OldCSI = + cast(CGF.CapturedStmtInfo)->getOldCSI(); + delete CGF.CapturedStmtInfo; + CGF.CapturedStmtInfo = OldCSI; + } +} + Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -225,8 +225,8 @@ } /// \brief Emits code for OpenMP parallel directive in the parallel region. -static void EmitOMPParallelCall(CodeGenFunction &CGF, - const OMPParallelDirective &S, +static void emitOMPParallelCall(CodeGenFunction &CGF, + const OMPExecutableDirective &S, llvm::Value *OutlinedFn, llvm::Value *CapturedStruct) { if (auto C = S.getSingleClause(/*K*/ OMPC_num_threads)) { @@ -241,22 +241,30 @@ CapturedStruct); } -void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { +static void emitCommonOMPParallelDirective( + CodeGenFunction &CGF, const OMPExecutableDirective &S, + CGOpenMPRuntime::CombinedCodeGenKind CodeGenKind) { auto CS = cast(S.getAssociatedStmt()); - auto CapturedStruct = GenerateCapturedStmtArgument(*CS); - auto OutlinedFn = CGM.getOpenMPRuntime().emitOutlinedFunction( - S, *CS->getCapturedDecl()->param_begin()); + auto CapturedStruct = CGF.GenerateCapturedStmtArgument(*CS); + auto OutlinedFn = CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction( + S, *CS->getCapturedDecl()->param_begin(), CodeGenKind); if (auto C = S.getSingleClause(/*K*/ OMPC_if)) { auto Cond = cast(C)->getCondition(); - EmitOMPIfClause(*this, Cond, [&](bool ThenBlock) { + EmitOMPIfClause(CGF, Cond, [&](bool ThenBlock) { if (ThenBlock) - EmitOMPParallelCall(*this, S, OutlinedFn, CapturedStruct); + emitOMPParallelCall(CGF, S, OutlinedFn, CapturedStruct); else - CGM.getOpenMPRuntime().emitSerialCall(*this, S.getLocStart(), - OutlinedFn, CapturedStruct); + CGF.CGM.getOpenMPRuntime().emitSerialCall(CGF, S.getLocStart(), + OutlinedFn, CapturedStruct); }); } else - EmitOMPParallelCall(*this, S, OutlinedFn, CapturedStruct); + emitOMPParallelCall(CGF, S, OutlinedFn, CapturedStruct); +} + +void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { + // Emit parallel region as a standalone region. + emitCommonOMPParallelDirective(*this, S, + CGOpenMPRuntime::CombinedCodeGenKind()); } void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &S, @@ -952,9 +960,27 @@ }, S.getLocStart()); } -void -CodeGenFunction::EmitOMPParallelForDirective(const OMPParallelForDirective &) { - llvm_unreachable("CodeGen for 'omp parallel for' is not supported yet."); +void CodeGenFunction::EmitOMPParallelForDirective( + const OMPParallelForDirective &S) { + // Emit directive as a combined directive that consists of two implicit + // directives: 'parallel' with 'for' directive. + OpenMPDirectiveKind CodeGenKinds[] = {OMPD_parallel, OMPD_for}; + auto CodeGenKind = CGOpenMPRuntime::CombinedCodeGenKind( + std::begin(CodeGenKinds), std::end(CodeGenKinds)); + CombinedOpenMPRegionRAII Region(*this, S, CodeGenKind); + CodeGenKind = Region.getCurrentCodeGenKind(); + // At first emit an outlined function for 'parallel' region. + if (*CodeGenKind == OMPD_parallel) { + emitCommonOMPParallelDirective(*this, S, CodeGenKind); + return; + } + // Emit implicit worksharing loop construct. + assert(*CodeGenKind == OMPD_for); + EmitOMPWorksharingLoop(S); + // Emit implicit barrier at the end of parallel region, but this barrier is at + // the end of 'for' directive, so emit it as the implicit barrier for this + // 'for' directive. + CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(), OMPD_for); } void CodeGenFunction::EmitOMPParallelForSimdDirective( Index: lib/CodeGen/CodeGenFunction.h =================================================================== --- lib/CodeGen/CodeGenFunction.h +++ lib/CodeGen/CodeGenFunction.h @@ -192,7 +192,7 @@ CapturedRegionKind getKind() const { return Kind; } - void setContextValue(llvm::Value *V) { ThisValue = V; } + virtual void setContextValue(llvm::Value *V) { ThisValue = V; } // \brief Retrieve the value of the context parameter. virtual llvm::Value *getContextValue() const { return ThisValue; } Index: test/OpenMP/parallel_codegen.cpp =================================================================== --- test/OpenMP/parallel_codegen.cpp +++ test/OpenMP/parallel_codegen.cpp @@ -39,7 +39,7 @@ // CHECK: [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0 // CHECK-NEXT: store i32* {{%[a-z0-9.]+}}, i32** [[ARGC_REF]] // CHECK-NEXT: [[BITCAST:%.+]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8* -// CHECK-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* @.omp_outlined. to void (i32*, i32*, ...)*), i8* [[BITCAST]]) +// CHECK-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]]) // CHECK-NEXT: [[ARGV:%.+]] = load i8**, i8*** {{%[a-z0-9.]+}} // CHECK-NEXT: [[RET:%.+]] = call {{[a-z]*[ ]?i32}} [[TMAIN:@.+tmain.+]](i8** [[ARGV]]) // CHECK-NEXT: ret i32 [[RET]] @@ -55,13 +55,13 @@ // CHECK-DEBUG-NEXT: [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4 // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC1]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]] // CHECK-DEBUG-NEXT: [[BITCAST:%.+]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8* -// CHECK-DEBUG-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* @.omp_outlined. to void (i32*, i32*, ...)*), i8* [[BITCAST]]) +// CHECK-DEBUG-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]]) // CHECK-DEBUG-NEXT: [[ARGV:%.+]] = load i8**, i8*** {{%[a-z0-9.]+}} // CHECK-DEBUG-NEXT: [[RET:%.+]] = call i32 [[TMAIN:@.+tmain.+]](i8** [[ARGV]]) // CHECK-DEBUG-NEXT: ret i32 [[RET]] // CHECK-DEBUG-NEXT: } -// CHECK-LABEL: define internal void @.omp_outlined.(i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context) +// CHECK: define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context) // CHECK: #[[FN_ATTRS:[0-9]+]] // CHECK: [[CONTEXT_ADDR:%.+]] = alloca %struct.anon* // CHECK: store %struct.anon* %__context, %struct.anon** [[CONTEXT_ADDR]] @@ -70,11 +70,12 @@ // CHECK-NEXT: [[ARGC_REF:%.+]] = load i32*, i32** [[ARGC_PTR_REF]] // CHECK-NEXT: [[ARGC:%.+]] = load i32, i32* [[ARGC_REF]] // CHECK-NEXT: invoke void [[FOO:@.+foo.+]](i32{{[ ]?[a-z]*}} [[ARGC]]) +// CHECK: call {{.+}} @__kmpc_cancel_barrier( // CHECK: ret void // CHECK: call void @{{.+terminate.*|abort}}( // CHECK-NEXT: unreachable // CHECK-NEXT: } -// CHECK-DEBUG-LABEL: define internal void @.omp_outlined.(i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context) +// CHECK-DEBUG: define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context) // CHECK-DEBUG: #[[FN_ATTRS:[0-9]+]] // CHECK-DEBUG: [[CONTEXT_ADDR:%.+]] = alloca %struct.anon* // CHECK-DEBUG: store %struct.anon* %__context, %struct.anon** [[CONTEXT_ADDR]] @@ -83,6 +84,7 @@ // CHECK-DEBUG-NEXT: [[ARGC_REF:%.+]] = load i32*, i32** [[ARGC_PTR_REF]] // CHECK-DEBUG-NEXT: [[ARGC:%.+]] = load i32, i32* [[ARGC_REF]] // CHECK-DEBUG-NEXT: invoke void [[FOO:@.+foo.+]](i32 [[ARGC]]) +// CHECK-DEBUG: call {{.+}} @__kmpc_cancel_barrier( // CHECK-DEBUG: ret void // CHECK-DEBUG: call void @{{.+terminate.*|abort}}( // CHECK-DEBUG-NEXT: unreachable @@ -98,7 +100,7 @@ // CHECK: [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[AGG_CAPTURED]], i32 0, i32 0 // CHECK-NEXT: store i8*** {{%[a-z0-9.]+}}, i8**** [[ARGC_REF]] // CHECK-NEXT: [[BITCAST:%.+]] = bitcast %struct.anon.0* [[AGG_CAPTURED]] to i8* -// CHECK-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* @.omp_outlined.1 to void (i32*, i32*, ...)*), i8* [[BITCAST]]) +// CHECK-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]]) // CHECK-NEXT: ret i32 0 // CHECK-NEXT: } // CHECK-DEBUG: define linkonce_odr i32 [[TMAIN]](i8** %argc) @@ -112,11 +114,11 @@ // CHECK-DEBUG-NEXT: [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4 // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC2]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]] // CHECK-DEBUG-NEXT: [[BITCAST:%.+]] = bitcast %struct.anon.0* [[AGG_CAPTURED]] to i8* -// CHECK-DEBUG-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* @.omp_outlined.1 to void (i32*, i32*, ...)*), i8* [[BITCAST]]) +// CHECK-DEBUG-NEXT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]]) // CHECK-DEBUG-NEXT: ret i32 0 // CHECK-DEBUG-NEXT: } -// CHECK-LABEL: define internal void @.omp_outlined.1(i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context) +// CHECK: define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context) // CHECK: [[CONTEXT_ADDR:%.+]] = alloca %struct.anon.0* // CHECK: store %struct.anon.0* %__context, %struct.anon.0** [[CONTEXT_ADDR]] // CHECK: [[CONTEXT_PTR:%.+]] = load %struct.anon.0*, %struct.anon.0** [[CONTEXT_ADDR]] @@ -124,11 +126,12 @@ // CHECK-NEXT: [[ARGC_REF:%.+]] = load i8***, i8**** [[ARGC_PTR_REF]] // CHECK-NEXT: [[ARGC:%.+]] = load i8**, i8*** [[ARGC_REF]] // CHECK-NEXT: invoke void [[FOO1:@.+foo.+]](i8** [[ARGC]]) +// CHECK: call {{.+}} @__kmpc_cancel_barrier( // CHECK: ret void // CHECK: call void @{{.+terminate.*|abort}}( // CHECK-NEXT: unreachable // CHECK-NEXT: } -// CHECK-DEBUG-LABEL: define internal void @.omp_outlined.1(i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context) +// CHECK-DEBUG: define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context) // CHECK-DEBUG: [[CONTEXT_ADDR:%.+]] = alloca %struct.anon.0* // CHECK-DEBUG: store %struct.anon.0* %__context, %struct.anon.0** [[CONTEXT_ADDR]] // CHECK-DEBUG: [[CONTEXT_PTR:%.+]] = load %struct.anon.0*, %struct.anon.0** [[CONTEXT_ADDR]] @@ -136,6 +139,7 @@ // CHECK-DEBUG-NEXT: [[ARGC_REF:%.+]] = load i8***, i8**** [[ARGC_PTR_REF]] // CHECK-DEBUG-NEXT: [[ARGC:%.+]] = load i8**, i8*** [[ARGC_REF]] // CHECK-DEBUG-NEXT: invoke void [[FOO1:@.+foo.+]](i8** [[ARGC]]) +// CHECK-DEBUG: call {{.+}} @__kmpc_cancel_barrier( // CHECK-DEBUG: ret void // CHECK-DEBUG: call void @{{.+terminate.*|abort}}( // CHECK-DEBUG-NEXT: unreachable Index: test/OpenMP/parallel_for_codegen.cpp =================================================================== --- test/OpenMP/parallel_for_codegen.cpp +++ test/OpenMP/parallel_for_codegen.cpp @@ -0,0 +1,398 @@ +// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp=libiomp5 -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG +// +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* } +// CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void without_schedule_clause(float *a, float *b, float *c, float *d) { + #pragma omp parallel for +// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}}) +// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}}) +// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]], +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1) +// UB = min(UB, GlobalUB) +// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423 +// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]] +// CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ] +// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]] +// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]] +// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]] +// Loop header +// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]] +// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] + for (int i = 33; i < 32000000; i += 7) { +// CHECK: [[LOOP1_BODY]] +// Start of body: calculate i from IV: +// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]] +// CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7 +// CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 33, [[CALC_I_1]] +// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]] +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1 +// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]]) +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + +// CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void static_not_chunked(float *a, float *b, float *c, float *d) { + #pragma omp parallel for schedule(static) +// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}}) +// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}}) +// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]], +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1) +// UB = min(UB, GlobalUB) +// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423 +// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]] +// CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ] +// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]] +// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]] +// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]] +// Loop header +// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]] +// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] + for (int i = 32000000; i > 33; i += -7) { +// CHECK: [[LOOP1_BODY]] +// Start of body: calculate i from IV: +// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]] +// CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7 +// CHECK-NEXT: [[CALC_I_2:%.+]] = sub nsw i32 32000000, [[CALC_I_1]] +// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]] +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1 +// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]]) +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + +// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void static_chunked(float *a, float *b, float *c, float *d) { + #pragma omp parallel for schedule(static, 5) +// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}}) +// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}}) +// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]], +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5) +// UB = min(UB, GlobalUB) +// CHECK: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288 +// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]] +// CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ] +// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]] +// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]] +// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]] + +// Outer loop header +// CHECK: [[O_IV:%.+]] = load i32, i32* [[OMP_IV]] +// CHECK-NEXT: [[O_UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]] +// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]] + +// Loop header +// CHECK: [[O_LOOP1_BODY]] +// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]] +// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] + for (unsigned i = 131071; i <= 2147483647; i += 127) { +// CHECK: [[LOOP1_BODY]] +// Start of body: calculate i from IV: +// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]] +// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127 +// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]] +// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]] +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1 +// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// Update the counters, adding stride +// CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]] +// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]] +// CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]] +// CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]] +// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]] +// CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]] +// CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]] + +// CHECK: [[O_LOOP1_END]] +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]]) +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + +// CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void dynamic1(float *a, float *b, float *c, float *d) { + #pragma omp parallel for schedule(dynamic) +// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}}) +// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}}) +// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]], +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 35, i64 0, i64 16908287, i64 1, i64 1) +// +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]]) +// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0 +// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]] + +// Loop header +// CHECK: [[O_LOOP1_BODY]] +// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]] +// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]] +// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]] + +// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] + for (unsigned long long i = 131071; i < 2147483647; i += 127) { +// CHECK: [[LOOP1_BODY]] +// Start of body: calculate i from IV: +// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]] +// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127 +// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]] +// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]] +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1 +// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// CHECK: [[O_LOOP1_END]] +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + +// CHECK-LABEL: define {{.*void}} @{{.*}}guided7{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void guided7(float *a, float *b, float *c, float *d) { + #pragma omp parallel for schedule(guided, 7) +// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}}) +// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}}) +// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]], +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 36, i64 0, i64 16908287, i64 1, i64 7) +// +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]]) +// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0 +// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]] + +// Loop header +// CHECK: [[O_LOOP1_BODY]] +// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]] +// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]] +// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]] + +// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] + for (unsigned long long i = 131071; i < 2147483647; i += 127) { +// CHECK: [[LOOP1_BODY]] +// Start of body: calculate i from IV: +// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]] +// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127 +// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]] +// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]] +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1 +// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// CHECK: [[O_LOOP1_END]] +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + +// CHECK-LABEL: define {{.*void}} @{{.*}}test_auto{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void test_auto(float *a, float *b, float *c, float *d) { + unsigned int x = 0; + unsigned int y = 0; + #pragma omp parallel for schedule(auto) collapse(2) +// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}}) +// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}}) +// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]], +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_dispatch_init_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 38, i64 0, i64 [[LAST_ITER:%[^,]+]], i64 1, i64 1) +// +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]]) +// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0 +// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]] + +// Loop header +// CHECK: [[O_LOOP1_BODY]] +// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]] +// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]] +// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]] + +// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp sle i64 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] +// FIXME: When the iteration count of some nested loop is not a known constant, +// we should pre-calculate it, like we do for the total number of iterations! + for (char i = static_cast(y); i <= '9'; ++i) + for (x = 11; x > 0; --x) { +// CHECK: [[LOOP1_BODY]] +// Start of body: indices are calculated from IV: +// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}} +// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}} +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i64 [[IV1_2]], 1 +// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// CHECK: [[O_LOOP1_END]] +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + +// CHECK-LABEL: define {{.*void}} @{{.*}}runtime{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void runtime(float *a, float *b, float *c, float *d) { + int x = 0; + #pragma omp parallel for collapse(2) schedule(runtime) +// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}}) +// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}}) +// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]], +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 37, i32 0, i32 199, i32 1, i32 1) +// +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]]) +// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0 +// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]] + +// Loop header +// CHECK: [[O_LOOP1_BODY]] +// CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]] +// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]] +// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]] + +// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] + for (unsigned char i = '0' ; i <= '9'; ++i) + for (x = -10; x < 10; ++x) { +// CHECK: [[LOOP1_BODY]] +// Start of body: indices are calculated from IV: +// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}} +// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}} +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1 +// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// CHECK: [[O_LOOP1_END]] +// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]], +// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]], +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + +// TERM_DEBUG-LABEL: foo +int foo() {return 0;}; + +// TERM_DEBUG-LABEL: parallel_for +void parallel_for(float *a) { +#pragma omp parallel for schedule(static, 5) + // TERM_DEBUG-NOT: __kmpc_global_thread_num + // TERM_DEBUG: call void @__kmpc_for_static_init_4u({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]] + // TERM_DEBUG: invoke i32 {{.*}}foo{{.*}}() + // TERM_DEBUG: unwind label %[[TERM_LPAD:.+]], + // TERM_DEBUG-NOT: __kmpc_global_thread_num + // TERM_DEBUG: call void @__kmpc_for_static_fini({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]] + // TERM_DEBUG: call {{.+}} @__kmpc_cancel_barrier({{.+}}), !dbg [[DBG_LOC_CANCEL:![0-9]+]] + // TERM_DEBUG: [[TERM_LPAD]] + // TERM_DEBUG: call void @__clang_call_terminate + // TERM_DEBUG: unreachable + for (unsigned i = 131071; i <= 2147483647; i += 127) + a[i] += foo(); +} +// Check source line corresponds to "#pragma omp parallel for schedule(static, 5)" above: +// TERM_DEBUG-DAG: [[DBG_LOC_START]] = !MDLocation(line: [[@LINE-15]], +// TERM_DEBUG-DAG: [[DBG_LOC_END]] = !MDLocation(line: [[@LINE-16]], +// TERM_DEBUG-DAG: [[DBG_LOC_CANCEL]] = !MDLocation(line: [[@LINE-17]], + +#endif // HEADER + Index: test/OpenMP/sections_codegen.cpp =================================================================== --- test/OpenMP/sections_codegen.cpp +++ test/OpenMP/sections_codegen.cpp @@ -96,6 +96,7 @@ // CHECK-NEXT: br label %[[END]] // CHECK: [[END]] // CHECK-NEXT: call i32 @__kmpc_cancel_barrier(%{{.+}}* [[IMPLICIT_BARRIER_SINGLE_LOC]], +// CHECK-NEXT: call i32 @__kmpc_cancel_barrier( // CHECK-NEXT: ret // CHECK: [[TERM_LPAD]] // CHECK: call void @__clang_call_terminate(i8*