diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -6275,6 +6275,15 @@ OMPC_nontemporal, SourceLocation(), SourceLocation(), SourceLocation(), N) {} + /// Get the list of privatied copies if the member expression was captured by + /// one of the privatization clauses. + MutableArrayRef getPrivateRefs() { + return MutableArrayRef(varlist_end(), varlist_size()); + } + ArrayRef getPrivateRefs() const { + return llvm::makeArrayRef(varlist_end(), varlist_size()); + } + public: /// Creates clause with a list of variables \a VL. /// @@ -6293,6 +6302,10 @@ /// \param N The number of variables. static OMPNontemporalClause *CreateEmpty(const ASTContext &C, unsigned N); + /// Sets the list of references to private copies created in private clauses. + /// \param VL List of references. + void setPrivateRefs(ArrayRef VL); + child_range children() { return child_range(reinterpret_cast(varlist_begin()), reinterpret_cast(varlist_end())); @@ -6303,6 +6316,16 @@ return const_child_range(Children.begin(), Children.end()); } + child_range private_refs() { + return child_range(reinterpret_cast(getPrivateRefs().begin()), + reinterpret_cast(getPrivateRefs().end())); + } + + const_child_range private_refs() const { + auto Children = const_cast(this)->private_refs(); + return const_child_range(Children.begin(), Children.end()); + } + child_range used_children() { return child_range(child_iterator(), child_iterator()); } diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -3378,6 +3378,9 @@ bool RecursiveASTVisitor::VisitOMPNontemporalClause( OMPNontemporalClause *C) { TRY_TO(VisitOMPClauseList(C)); + for (auto *E : C->private_refs()) { + TRY_TO(TraverseStmt(E)); + } return true; } diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1162,8 +1162,8 @@ SourceLocation LParenLoc, SourceLocation EndLoc, ArrayRef VL) { - // Allocate space for nontemporal variables. - void *Mem = C.Allocate(totalSizeToAlloc(VL.size())); + // Allocate space for nontemporal variables + private references. + void *Mem = C.Allocate(totalSizeToAlloc(2 * VL.size())); auto *Clause = new (Mem) OMPNontemporalClause(StartLoc, LParenLoc, EndLoc, VL.size()); Clause->setVarRefs(VL); @@ -1172,10 +1172,16 @@ OMPNontemporalClause *OMPNontemporalClause::CreateEmpty(const ASTContext &C, unsigned N) { - void *Mem = C.Allocate(totalSizeToAlloc(N)); + void *Mem = C.Allocate(totalSizeToAlloc(2 * N)); return new (Mem) OMPNontemporalClause(N); } +void OMPNontemporalClause::setPrivateRefs(ArrayRef VL) { + assert(VL.size() == varlist_size() && "Number of private references is not " + "the same as the preallocated buffer"); + std::copy(VL.begin(), VL.end(), varlist_end()); +} + //===----------------------------------------------------------------------===// // OpenMP clauses printing methods //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -769,10 +769,13 @@ const OMPIsDevicePtrClause *C) { VisitOMPClauseList(C); } -void OMPClauseProfiler::VisitOMPNontemporalClause(const OMPNontemporalClause *C) { +void OMPClauseProfiler::VisitOMPNontemporalClause( + const OMPNontemporalClause *C) { VisitOMPClauseList(C); + for (auto *E : C->private_refs()) + Profiler->VisitStmt(E); } -} +} // namespace void StmtProfiler::VisitOMPExecutableDirective(const OMPExecutableDirective *S) { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -2566,21 +2566,35 @@ VD = VD->getCanonicalDecl(); if (auto *FD = LambdaCaptureFields.lookup(VD)) return EmitCapturedFieldLValue(*this, FD, CXXABIThisValue); - else if (CapturedStmtInfo) { + if (CapturedStmtInfo) { auto I = LocalDeclMap.find(VD); if (I != LocalDeclMap.end()) { + LValue CapLVal; if (VD->getType()->isReferenceType()) - return EmitLoadOfReferenceLValue(I->second, VD->getType(), - AlignmentSource::Decl); - return MakeAddrLValue(I->second, T); + CapLVal = EmitLoadOfReferenceLValue(I->second, VD->getType(), + AlignmentSource::Decl); + else + CapLVal = MakeAddrLValue(I->second, T); + // Mark lvalue as nontemporal if the variable is marked as nontemporal + // in simd context. + if (getLangOpts().OpenMP && + CGM.getOpenMPRuntime().isNontemporalDecl(VD)) + CapLVal.setNontemporal(/*Value=*/true); + return CapLVal; } LValue CapLVal = EmitCapturedFieldLValue(*this, CapturedStmtInfo->lookup(VD), CapturedStmtInfo->getContextValue()); - return MakeAddrLValue( + CapLVal = MakeAddrLValue( Address(CapLVal.getPointer(*this), getContext().getDeclAlign(VD)), CapLVal.getType(), LValueBaseInfo(AlignmentSource::Decl), CapLVal.getTBAAInfo()); + // Mark lvalue as nontemporal if the variable is marked as nontemporal + // in simd context. + if (getLangOpts().OpenMP && + CGM.getOpenMPRuntime().isNontemporalDecl(VD)) + CapLVal.setNontemporal(/*Value=*/true); + return CapLVal; } assert(isa(CurCodeDecl)); @@ -3929,6 +3943,15 @@ if (auto *Field = dyn_cast(ND)) { LValue LV = EmitLValueForField(BaseLV, Field); setObjCGCLValueClass(getContext(), E, LV); + if (getLangOpts().OpenMP) { + // If the member was explicitly marked as nontemporal, mark it as + // nontemporal. If the base lvalue is marked as nontemporal, mark access + // to children as nontemporal too. + if ((IsWrappedCXXThis(BaseExpr) && + CGM.getOpenMPRuntime().isNontemporalDecl(Field)) || + BaseLV.isNontemporal()) + LV.setNontemporal(/*Value=*/true); + } return LV; } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -211,6 +211,16 @@ ~DisableAutoDeclareTargetRAII(); }; + /// Manages list of nontemporal decls for the specified directive. + class NontemporalDeclsRAII { + CodeGenModule &CGM; + const bool NeedToPush; + + public: + NontemporalDeclsRAII(CodeGenModule &CGM, const OMPLoopDirective &S); + ~NontemporalDeclsRAII(); + }; + protected: CodeGenModule &CGM; StringRef FirstSeparator, Separator; @@ -650,6 +660,11 @@ std::pair> DeferredVariantFunction; + using NontemporalDeclsSet = llvm::SmallDenseSet>; + /// Stack for list of declarations in current context marked as nontemporal. + /// The set is the union of all current stack elements. + llvm::SmallVector NontemporalDeclsStack; + /// Flag for keeping track of weather a requires unified_shared_memory /// directive is present. bool HasRequiresUnifiedSharedMemory = false; @@ -1663,6 +1678,10 @@ /// Emits the definition of the declare variant function. virtual bool emitDeclareVariant(GlobalDecl GD, bool IsForDefinition); + + /// Checks if the \p VD variable is marked as nontemporal declaration in + /// current context. + bool isNontemporalDecl(const ValueDecl *VD) const; }; /// Class supports emissionof SIMD-only code. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -17,6 +17,7 @@ #include "CodeGenFunction.h" #include "clang/AST/Attr.h" #include "clang/AST/Decl.h" +#include "clang/AST/OpenMPClause.h" #include "clang/AST/StmtOpenMP.h" #include "clang/Basic/BitmaskEnum.h" #include "clang/CodeGen/ConstantInitBuilder.h" @@ -11341,6 +11342,46 @@ return true; } +CGOpenMPRuntime::NontemporalDeclsRAII::NontemporalDeclsRAII( + CodeGenModule &CGM, const OMPLoopDirective &S) + : CGM(CGM), NeedToPush(S.hasClausesOfKind()) { + assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode."); + if (!NeedToPush) + return; + NontemporalDeclsSet &DS = + CGM.getOpenMPRuntime().NontemporalDeclsStack.emplace_back(); + for (const auto *C : S.getClausesOfKind()) { + for (const Stmt *Ref : C->private_refs()) { + const auto *SimpleRefExpr = cast(Ref)->IgnoreParenImpCasts(); + const ValueDecl *VD; + if (const auto *DRE = dyn_cast(SimpleRefExpr)) { + VD = DRE->getDecl(); + } else { + const auto *ME = cast(SimpleRefExpr); + assert((ME->isImplicitCXXThis() || + isa(ME->getBase()->IgnoreParenImpCasts())) && + "Expected member of current class."); + VD = ME->getMemberDecl(); + } + DS.insert(VD); + } + } +} + +CGOpenMPRuntime::NontemporalDeclsRAII::~NontemporalDeclsRAII() { + if (!NeedToPush) + return; + CGM.getOpenMPRuntime().NontemporalDeclsStack.pop_back(); +} + +bool CGOpenMPRuntime::isNontemporalDecl(const ValueDecl *VD) const { + assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode."); + + return llvm::any_of( + CGM.getOpenMPRuntime().NontemporalDeclsStack, + [VD](const NontemporalDeclsSet &Set) { return Set.count(VD) > 0; }); +} + llvm::Function *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1803,8 +1803,9 @@ static void emitCommonSimdLoop(CodeGenFunction &CGF, const OMPLoopDirective &S, const RegionCodeGenTy &SimdInitGen, const RegionCodeGenTy &BodyCodeGen) { - auto &&ThenGen = [&SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF, - PrePostActionTy &) { + auto &&ThenGen = [&S, &SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF, + PrePostActionTy &) { + CGOpenMPRuntime::NontemporalDeclsRAII NontemporalsRegion(CGF.CGM, S); CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF); SimdInitGen(CGF); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2230,6 +2230,11 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, ArrayRef Clauses); +static std::pair +getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, + SourceRange &ERange, bool AllowArraySection = false); +static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr, + bool WithInit); void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { // OpenMP [2.14.3.5, Restrictions, C/C++, p.1] @@ -2274,6 +2279,31 @@ } } Clause->setPrivateCopies(PrivateCopies); + continue; + } + // Finalize nontemporal clause by handling private copies, if any. + if (auto *Clause = dyn_cast(C)) { + SmallVector PrivateRefs; + for (Expr *RefExpr : Clause->varlists()) { + assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); + SourceLocation ELoc; + SourceRange ERange; + Expr *SimpleRefExpr = RefExpr; + auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + if (Res.second) + // It will be analyzed later. + PrivateRefs.push_back(RefExpr); + ValueDecl *D = Res.first; + if (!D) + continue; + + const DSAStackTy::DSAVarData DVar = + DSAStack->getTopDSA(D, /*FromParent=*/false); + PrivateRefs.push_back(DVar.PrivateCopy ? DVar.PrivateCopy + : SimpleRefExpr); + } + Clause->setPrivateRefs(PrivateRefs); + continue; } } // Check allocate clauses. @@ -4262,9 +4292,10 @@ return ErrorFound; } -static std::pair -getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, - SourceRange &ERange, bool AllowArraySection = false) { +static std::pair getPrivateItem(Sema &S, Expr *&RefExpr, + SourceLocation &ELoc, + SourceRange &ERange, + bool AllowArraySection) { if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() || RefExpr->containsUnexpandedParameterPack()) return std::make_pair(nullptr, true); @@ -17172,8 +17203,6 @@ if (!D) continue; - auto *VD = dyn_cast(D); - // OpenMP 5.0, 2.9.3.1 simd Construct, Restrictions. // A list-item cannot appear in more than one nontemporal clause. if (const Expr *PrevRef = @@ -17185,12 +17214,7 @@ continue; } - DeclRefExpr *Ref = nullptr; - if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext()) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); - Vars.push_back((VD || !Ref || CurContext->isDependentContext()) - ? RefExpr->IgnoreParens() - : Ref); + Vars.push_back(RefExpr); } if (Vars.empty()) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12466,4 +12466,9 @@ for (unsigned i = 0; i != NumVars; ++i) Vars.push_back(Record.readSubExpr()); C->setVarRefs(Vars); + Vars.clear(); + Vars.reserve(NumVars); + for (unsigned i = 0; i != NumVars; ++i) + Vars.push_back(Record.readSubExpr()); + C->setPrivateRefs(Vars); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6544,4 +6544,6 @@ Record.AddSourceLocation(C->getLParenLoc()); for (auto *VE : C->varlists()) Record.AddStmt(VE); + for (auto *E : C->private_refs()) + Record.AddStmt(E); } diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -143,7 +143,7 @@ #pragma omp target #pragma omp teams #ifdef OMP5 - #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true) + #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true) nontemporal(a, b) #else #pragma omp distribute simd dist_schedule(static) safelen(32) #endif // OMP5 @@ -189,6 +189,11 @@ // CHECK: [[BBINNBODY]]: // CHECK: {{.+}} = load i32, i32* [[IV]] // ... loop body ... +// OMP45-NOT: !nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50-NOT: !nontemporal + // CHECK: br label %[[BBBODYCONT:.+]] // CHECK: [[BBBODYCONT]]: // CHECK: br label %[[BBINNINC:.+]] @@ -271,7 +276,7 @@ #pragma omp target #pragma omp teams #ifdef OMP5 - #pragma omp distribute simd linear(i) if(a) + #pragma omp distribute simd linear(i) if(a) nontemporal(i) #else #pragma omp distribute simd linear(i) #endif // OMP5 @@ -293,6 +298,9 @@ // CHECK: br i1 [[ACMP]], label %[[PRECOND_THEN:.+]], label %[[PRECOND_END:.+]] // CHECK: [[PRECOND_THEN]] // CHECK: call void @__kmpc_for_static_init_4 +// OMP45-NOT: !nontemporal +// OMP50: store i8 {{.*}}!nontemporal +// OMP50-NOT: !nontemporal // CHECK: call void @__kmpc_for_static_fini // CHECK: [[PRECOND_END]] diff --git a/clang/test/OpenMP/for_simd_codegen.cpp b/clang/test/OpenMP/for_simd_codegen.cpp --- a/clang/test/OpenMP/for_simd_codegen.cpp +++ b/clang/test/OpenMP/for_simd_codegen.cpp @@ -333,7 +333,7 @@ // OMP50: br i1 [[COND]], label {{%?}}[[THEN:[^,]+]], label {{%?}}[[ELSE:[^,]+]] // OMP50: [[THEN]]: #ifdef OMP5 - #pragma omp for simd reduction(*:R) if (simd:A) + #pragma omp for simd reduction(*:R) if (simd:A) nontemporal(R) #else #pragma omp for simd reduction(*:R) #endif @@ -366,7 +366,8 @@ // CHECK-NEXT: [[LC_IT_2:%.+]] = add nsw i64 -10, [[LC_IT_1]] // CHECK-NEXT: store i64 [[LC_IT_2]], i64* [[LC:%[^,]+]], // CHECK-NEXT: [[LC_VAL:%.+]] = load i64, i64* [[LC]] -// CHECK: store i32 %{{.+}}, i32* [[R_PRIV]], +// OMP45: store i32 %{{.+}}, i32* [[R_PRIV]], +// OMP50: store i32 %{{.+}}, i32* [[R_PRIV]],{{.*}}!nontemporal R *= i; // CHECK: [[IV8_2:%.+]] = load i64, i64* [[OMP_IV8]] // CHECK-NEXT: [[ADD8_2:%.+]] = add nsw i64 [[IV8_2]], 1 diff --git a/clang/test/OpenMP/simd_codegen.cpp b/clang/test/OpenMP/simd_codegen.cpp --- a/clang/test/OpenMP/simd_codegen.cpp +++ b/clang/test/OpenMP/simd_codegen.cpp @@ -22,10 +22,15 @@ long long get_val() { return 0; } double *g_ptr; +struct S { + int a, b; +}; + // CHECK-LABEL: define {{.*void}} @{{.*}}simple{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) void simple(float *a, float *b, float *c, float *d) { + S s, *p; #ifdef OMP5 - #pragma omp simd if (simd: true) + #pragma omp simd if (simd: true) nontemporal(a, b, c, d, s) #else #pragma omp simd #endif @@ -43,8 +48,17 @@ // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]{{.*}}!llvm.access.group // ... loop body ... // End of body: store into a[i]: +// OMP45-NOT: load float*,{{.*}}!nontemporal +// CHECK-NOT: load float,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load i32,{{.*}}!nontemporal +// OMP50-NOT: load i32,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// CHECK-NOT: load float,{{.*}}!nontemporal // CHECK: store float [[RESULT:%.+]], float* {{%.+}}{{.*}}!llvm.access.group - a[i] = b[i] * c[i] * d[i]; + a[i] = b[i] * c[i] * d[i] + s.a + p->a; // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.access.group // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]{{.*}}!llvm.access.group @@ -718,6 +732,47 @@ // } +#ifdef OMP5 +// OMP50-LABEL: inner_simd +void inner_simd() { + double a, b; +#pragma omp simd nontemporal(a) + for (int i = 0; i < 10; ++i) { +#pragma omp simd nontemporal(b) + for (int k = 0; k < 10; ++k) { + // OMP50: load double,{{.*}}!nontemporal + // OMP50: store double{{.*}}!nontemporal + a = b; + } + // OMP50-NOT: load double,{{.*}}!nontemporal + // OMP50: load double, + // OMP50: store double{{.*}}!nontemporal + a = b; + } +} + +extern struct T t; +struct Base { + float a; +}; +struct T : public Base { + void foo() { +#pragma omp simd nontemporal(Base::a) + for (int i = 0; i < 10; ++i) { + // OMP50: store float{{.*}}!nontemporal + // OMP50-NOT: nontemporal + // OMP50-NEXT: store float + Base::a = 0; + t.a = 0; + } + } +} t; + +void bartfoo() { + t.foo(); +} + +#endif // OMP5 // TERM_DEBUG-LABEL: bar int bar() {return 0;}; diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp @@ -539,7 +539,7 @@ short int c[2][n]; #ifdef OMP5 - #pragma omp target parallel for simd if(n>60) + #pragma omp target parallel for simd if(n>60) nontemporal(a) #else #pragma omp target parallel for simd if(target: n>60) #endif // OMP5 @@ -837,6 +837,9 @@ // OMP45: define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}}) // OMP50: define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}}, i[[SZ]] %{{.+}}) // To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. +// OMP45-NOT: !nontemporal +// OMP50: store double{{.*}}!nontemporal +// OMP50: load double{{.*}}!nontemporal // CHECK: define internal void [[HVT6]] diff --git a/clang/test/OpenMP/target_simd_codegen.cpp b/clang/test/OpenMP/target_simd_codegen.cpp --- a/clang/test/OpenMP/target_simd_codegen.cpp +++ b/clang/test/OpenMP/target_simd_codegen.cpp @@ -85,8 +85,8 @@ // CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i64] [i64 800, i64 800, i64 547] // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i64] [i64 4, i64 2, i64 1, i64 40] // CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i64] [i64 800, i64 800, i64 800, i64 547] -// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547] -// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [7 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547, i64 800] +// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547] +// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547, i64 800] // CHECK-DAG: @{{.*}} = weak constant i8 0 // CHECK-DAG: @{{.*}} = weak constant i8 0 // CHECK-DAG: @{{.*}} = weak constant i8 0 @@ -461,7 +461,11 @@ int b = n+1; short int c[2][n]; - #pragma omp target simd if(n>60) +#ifdef OMP5 + #pragma omp target simd if(n>60) nontemporal(a) private(a) +#else + #pragma omp target simd if(n>60) private(a) +#endif // OMP5 for (unsigned long long it = 2000; it >= 600; it -= 400) { this->a = (double)b + 1.5; c[1][1] = ++a; @@ -519,96 +523,84 @@ // CHECK-32: [[CSZSIZE:%.+]] = mul nuw i32 [[CELEMSIZE2]], 2 // CHECK-32: [[CSIZE:%.+]] = sext i32 [[CSZSIZE]] to i64 -// OMP45-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x i64], [6 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) -// OMP50-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 7, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([7 x i64], [7 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) -// OMP45-DAG: [[BPR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP:%.+]], i32 0, i32 0 -// OMP45-DAG: [[PR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P:%.+]], i32 0, i32 0 -// OMP45-DAG: [[SR]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S:%.+]], i32 0, i32 0 -// OMP45-DAG: [[SADDR0:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX0:[0-9]+]] -// OMP45-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX0]] -// OMP45-DAG: [[PADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX0]] -// OMP45-DAG: [[SADDR1:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX1:[0-9]+]] -// OMP45-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX1]] -// OMP45-DAG: [[PADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX1]] -// OMP45-DAG: [[SADDR2:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX2:[0-9]+]] -// OMP45-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX2]] -// OMP45-DAG: [[PADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX2]] -// OMP45-DAG: [[SADDR3:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX3:[0-9]+]] -// OMP45-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX3]] -// OMP45-DAG: [[PADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX3]] -// OMP45-DAG: [[SADDR4:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX4:[0-9]+]] -// OMP45-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX4]] -// OMP45-DAG: [[PADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX4]] -// OMP45-DAG: [[SADDR5:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX5:[0-9]+]] -// OMP45-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX5]] -// OMP45-DAG: [[PADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX5]] -// OMP50-DAG: [[BPR]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP:%.+]], i32 0, i32 0 -// OMP50-DAG: [[PR]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P:%.+]], i32 0, i32 0 -// OMP50-DAG: [[SR]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S:%.+]], i32 0, i32 0 -// OMP50-DAG: [[SADDR0:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX0:[0-9]+]] -// OMP50-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX0]] -// OMP50-DAG: [[PADDR0:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX0]] -// OMP50-DAG: [[SADDR1:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX1:[0-9]+]] -// OMP50-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX1]] -// OMP50-DAG: [[PADDR1:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX1]] -// OMP50-DAG: [[SADDR2:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX2:[0-9]+]] -// OMP50-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX2]] -// OMP50-DAG: [[PADDR2:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX2]] -// OMP50-DAG: [[SADDR3:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX3:[0-9]+]] -// OMP50-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX3]] -// OMP50-DAG: [[PADDR3:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX3]] -// OMP50-DAG: [[SADDR4:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX4:[0-9]+]] -// OMP50-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX4]] -// OMP50-DAG: [[PADDR4:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX4]] -// OMP50-DAG: [[SADDR5:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX5:[0-9]+]] -// OMP50-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX5]] -// OMP50-DAG: [[PADDR5:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX5]] -// OMP50-DAG: [[SADDR6:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX6:[0-9]+]] -// OMP50-DAG: [[BPADDR6:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX6]] -// OMP50-DAG: [[PADDR6:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX6]] +// OMP45-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) +// OMP50-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x i64], [6 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) +// OMP45-DAG: [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0 +// OMP45-DAG: [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0 +// OMP45-DAG: [[SR]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S:%.+]], i32 0, i32 0 +// OMP45-DAG: [[SADDR0:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX0:[0-9]+]] +// OMP45-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX0]] +// OMP45-DAG: [[PADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX0]] +// OMP45-DAG: [[SADDR1:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX1:[0-9]+]] +// OMP45-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX1]] +// OMP45-DAG: [[PADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX1]] +// OMP45-DAG: [[SADDR2:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX2:[0-9]+]] +// OMP45-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX2]] +// OMP45-DAG: [[PADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX2]] +// OMP45-DAG: [[SADDR3:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX3:[0-9]+]] +// OMP45-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX3]] +// OMP45-DAG: [[PADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX3]] +// OMP45-DAG: [[SADDR4:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX4:[0-9]+]] +// OMP45-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX4]] +// OMP45-DAG: [[PADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX4]] +// OMP50-DAG: [[BPR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP:%.+]], i32 0, i32 0 +// OMP50-DAG: [[PR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P:%.+]], i32 0, i32 0 +// OMP50-DAG: [[SR]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S:%.+]], i32 0, i32 0 +// OMP50-DAG: [[SADDR0:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX0:[0-9]+]] +// OMP50-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX0]] +// OMP50-DAG: [[PADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX0]] +// OMP50-DAG: [[SADDR1:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX1:[0-9]+]] +// OMP50-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX1]] +// OMP50-DAG: [[PADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX1]] +// OMP50-DAG: [[SADDR2:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX2:[0-9]+]] +// OMP50-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX2]] +// OMP50-DAG: [[PADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX2]] +// OMP50-DAG: [[SADDR3:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX3:[0-9]+]] +// OMP50-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX3]] +// OMP50-DAG: [[PADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX3]] +// OMP50-DAG: [[SADDR4:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX4:[0-9]+]] +// OMP50-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX4]] +// OMP50-DAG: [[PADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX4]] +// OMP50-DAG: [[SADDR5:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX5:[0-9]+]] +// OMP50-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX5]] +// OMP50-DAG: [[PADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX5]] // The names below are not necessarily consistent with the names used for the // addresses above as some are repeated. // CHECK-DAG: store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR0:%.+]], -// CHECK-DAG: store double* %{{.+}}, double** [[CPADDR0:%.+]], +// CHECK-DAG: store [[S1]]* %{{.+}}, [[S1]]** [[CPADDR0:%.+]], // CHECK-DAG: [[CBPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]** -// CHECK-DAG: [[CPADDR0]] = bitcast i8** {{%[^,]+}} to double** -// CHECK-DAG: store i64 %{{.+}}, i64* {{%[^,]+}} - -// CHECK-DAG: store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR1:%.+]], -// CHECK-DAG: store double* %{{.+}}, double** [[CPADDR1:%.+]], -// CHECK-DAG: [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to [[S1]]** -// CHECK-DAG: [[CPADDR1]] = bitcast i8** {{%[^,]+}} to double** +// CHECK-DAG: [[CPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]** // CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} -// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR2:%.+]], -// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR2:%.+]], +// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR1:%.+]], +// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR1:%.+]], +// CHECK-DAG: [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// CHECK-DAG: [[CPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// CHECK-DAG: store i64 4, i64* {{%[^,]+}} + +// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CBPADDR2:%.+]], +// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CPADDR2:%.+]], // CHECK-DAG: [[CBPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // CHECK-DAG: [[CPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// CHECK-DAG: store i64 4, i64* {{%[^,]+}} +// CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} -// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CBPADDR3:%.+]], -// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CPADDR3:%.+]], +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR3:%.+]], +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR3:%.+]], // CHECK-DAG: [[CBPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // CHECK-DAG: [[CPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} -// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR4:%.+]], -// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR4:%.+]], -// CHECK-DAG: [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// CHECK-DAG: [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} - -// CHECK-DAG: store i16* %{{.+}}, i16** [[CBPADDR5:%.+]], -// CHECK-DAG: store i16* %{{.+}}, i16** [[CPADDR5:%.+]], -// CHECK-DAG: [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i16** -// CHECK-DAG: [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i16** +// CHECK-DAG: store i16* %{{.+}}, i16** [[CBPADDR4:%.+]], +// CHECK-DAG: store i16* %{{.+}}, i16** [[CPADDR4:%.+]], +// CHECK-DAG: [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i16** +// CHECK-DAG: [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i16** // CHECK-DAG: store i64 [[CSIZE]], i64* {{%[^,]+}} -// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR6:%.+]], -// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CPADDR6:%.+]], -// OMP50-DAG: [[CBPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// OMP50-DAG: [[CPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR5:%.+]], +// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CPADDR5:%.+]], +// OMP50-DAG: [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// OMP50-DAG: [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // OMP50-DAG: store i64 1, i64* {{%[^,]+}} // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 @@ -738,6 +730,10 @@ // OMP50-DAG: [[CONV_COND:%.+]] = bitcast i[[SZ]]* [[LOCAL_SIMD_COND_CASTED]] to i8* // OMP50-DAG: [[SIMD_COND:%.+]] = load i8, i8* [[CONV_COND]], // OMP50-DAG: trunc i8 [[SIMD_COND]] to i1 +// OMP45-NOT: !nontemporal +// OMP50: store double {{.*}}!nontemporal +// OMP50: load double, {{.*}}!nontemporal +// OMP50: store double {{.*}}!nontemporal // CHECK: define internal void [[HVT6]] // Create local storage for each capture. diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp @@ -35,20 +35,25 @@ // CHECK: call i{{[0-9]+}} @__tgt_target_teams( // CHECK: call void [[OFFLOADING_FUN_1:@.+]]( #ifdef OMP5 -#pragma omp target teams distribute parallel for simd if(simd: true) +#pragma omp target teams distribute parallel for simd if(simd: true) nontemporal(Arg) #else #pragma omp target teams distribute parallel for simd #endif // OMP5 - for(int i = 0 ; i < 100; i++) {} + for (int i = 0; i < 100; i++) { + Arg = 0; + } // CHECK: define internal void [[OFFLOADING_FUN_0]]( - // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}}) + // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}}) // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]]( // CHECK: call void @__kmpc_for_static_init_4( - // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void + // OMP50: load i32,{{.*}}!nontemporal + // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 3, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void // CHECK: call void @__kmpc_for_static_fini( // CHECK: define{{.+}} void [[OMP_OUTLINED_0]]( // CHECK: call void @__kmpc_for_static_init_4( + // OMP45-NOT: !nontemporal + // OMP50: store i32 0,{{.*}}!nontemporal // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret #pragma omp target teams distribute parallel for simd if (parallel: false) diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp --- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp @@ -165,7 +165,7 @@ // CHECK: call void [[HVT1:@.+]](i[[SZ]] {{[^,]+}}) #ifdef OMP5 - #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1) + #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1) nontemporal(a) #else #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) #endif // OMP5 @@ -395,6 +395,8 @@ // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align // CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align // CHECK-64: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32* +// OMP45-NOT: !nontemporal +// OMP50: load i32,{{.*}}!nontemporal // CHECK-64: store i32 10, i32* [[AA_CADDR]], align // CHECK-32: store i32 10, i32* [[AA_ADDR]], align // CHECK: ret void diff --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp --- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp @@ -177,16 +177,16 @@ // CK3: define {{.*}}i32 @{{.+}}foo{{.+}}( int foo(void) { - // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1) + // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1) // CK3: call void @[[OFFL1:.+]]([[SSI]]* %{{.+}}) #pragma omp target #ifdef OMP5 - #pragma omp teams distribute simd if(b) + #pragma omp teams distribute simd if(b) nontemporal(a, b) #else #pragma omp teams distribute simd #endif // OMP5 for(int i = 0; i < X; i++) { - a[i] = (T)0; + a[i] = (T)b; } // outlined target region @@ -197,6 +197,8 @@ // CK3: define internal void @[[OUTL1]]({{.+}}) // CK3: call void @__kmpc_for_static_init_4( + // OMP3_45-NOT: !nontemporal + // OMP3_50: load float,{{.*}}!nontemporal // CK3: call void @__kmpc_for_static_fini( // CK3: ret void diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2458,6 +2458,8 @@ void OMPClauseEnqueue::VisitOMPNontemporalClause( const OMPNontemporalClause *C) { VisitOMPClauseList(C); + for (const auto *E : C->private_refs()) + Visitor->AddStmt(E); } }