diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -241,11 +241,22 @@
       if (const Expr *E = TG->getReductionRef())
         CGF.EmitVarDecl(*cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl()));
     }
+    // Temp copy arrays for inscan reductions should not be emitted as they are
+    // not used in simd only mode.
+    llvm::DenseSet<CanonicalDeclPtr<const Decl>> CopyArrayTemps;
+    for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
+      if (C->getModifier() != OMPC_REDUCTION_inscan)
+        continue;
+      for (const Expr *E : C->copy_array_temps())
+        CopyArrayTemps.insert(cast<DeclRefExpr>(E)->getDecl());
+    }
     const auto *CS = cast_or_null<CapturedStmt>(S.getAssociatedStmt());
     while (CS) {
       for (auto &C : CS->captures()) {
         if (C.capturesVariable() || C.capturesVariableByCopy()) {
           auto *VD = C.getCapturedVar();
+          if (CopyArrayTemps.contains(VD))
+            continue;
           assert(VD == VD->getCanonicalDecl() &&
                  "Canonical decl must be captured.");
           DeclRefExpr DRE(CGF.getContext(), const_cast<VarDecl *>(VD),
@@ -3295,53 +3306,30 @@
   return {LBVal, UBVal};
 }
 
-/// Emits the code for the directive with inscan reductions.
+/// Emits internal temp array declarations for the directive with inscan
+/// reductions.
 /// The code is the following:
 /// \code
 /// size num_iters = <num_iters>;
 /// <type> buffer[num_iters];
-/// #pragma omp ...
-/// for (i: 0..<num_iters>) {
-///   <input phase>;
-///   buffer[i] = red;
-/// }
-/// for (int k = 0; k != ceil(log2(num_iters)); ++k)
-/// for (size cnt = last_iter; cnt >= pow(2, k); --k)
-///   buffer[i] op= buffer[i-pow(2,k)];
-/// #pragma omp ...
-/// for (0..<num_iters>) {
-///   red = InclusiveScan ? buffer[i] : buffer[i-1];
-///   <scan phase>;
-/// }
 /// \endcode
-static void emitScanBasedDirective(
+static void emitScanBasedDirectiveDecls(
     CodeGenFunction &CGF, const OMPLoopDirective &S,
-    llvm::function_ref<llvm::Value *(CodeGenFunction &)> NumIteratorsGen,
-    llvm::function_ref<void(CodeGenFunction &)> FirstGen,
-    llvm::function_ref<void(CodeGenFunction &)> SecondGen) {
+    llvm::function_ref<llvm::Value *(CodeGenFunction &)> NumIteratorsGen) {
   llvm::Value *OMPScanNumIterations = CGF.Builder.CreateIntCast(
       NumIteratorsGen(CGF), CGF.SizeTy, /*isSigned=*/false);
   SmallVector<const Expr *, 4> Shareds;
   SmallVector<const Expr *, 4> Privates;
   SmallVector<const Expr *, 4> ReductionOps;
-  SmallVector<const Expr *, 4> LHSs;
-  SmallVector<const Expr *, 4> RHSs;
-  SmallVector<const Expr *, 4> CopyOps;
   SmallVector<const Expr *, 4> CopyArrayTemps;
-  SmallVector<const Expr *, 4> CopyArrayElems;
   for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
     assert(C->getModifier() == OMPC_REDUCTION_inscan &&
            "Only inscan reductions are expected.");
     Shareds.append(C->varlist_begin(), C->varlist_end());
     Privates.append(C->privates().begin(), C->privates().end());
     ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
-    LHSs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
-    RHSs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
-    CopyOps.append(C->copy_ops().begin(), C->copy_ops().end());
     CopyArrayTemps.append(C->copy_array_temps().begin(),
                           C->copy_array_temps().end());
-    CopyArrayElems.append(C->copy_array_elems().begin(),
-                          C->copy_array_elems().end());
   }
   {
     // Emit buffers for each reduction variables.
@@ -3370,6 +3358,49 @@
       ++Count;
     }
   }
+}
+
+/// Emits the code for the directive with inscan reductions.
+/// The code is the following:
+/// \code
+/// #pragma omp ...
+/// for (i: 0..<num_iters>) {
+///   <input phase>;
+///   buffer[i] = red;
+/// }
+/// #pragma omp master // in parallel region
+/// for (int k = 0; k != ceil(log2(num_iters)); ++k)
+/// for (size cnt = last_iter; cnt >= pow(2, k); --k)
+///   buffer[i] op= buffer[i-pow(2,k)];
+/// #pragma omp barrier // in parallel region
+/// #pragma omp ...
+/// for (0..<num_iters>) {
+///   red = InclusiveScan ? buffer[i] : buffer[i-1];
+///   <scan phase>;
+/// }
+/// \endcode
+static void emitScanBasedDirective(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    llvm::function_ref<llvm::Value *(CodeGenFunction &)> NumIteratorsGen,
+    llvm::function_ref<void(CodeGenFunction &)> FirstGen,
+    llvm::function_ref<void(CodeGenFunction &)> SecondGen) {
+  llvm::Value *OMPScanNumIterations = CGF.Builder.CreateIntCast(
+      NumIteratorsGen(CGF), CGF.SizeTy, /*isSigned=*/false);
+  SmallVector<const Expr *, 4> Privates;
+  SmallVector<const Expr *, 4> ReductionOps;
+  SmallVector<const Expr *, 4> LHSs;
+  SmallVector<const Expr *, 4> RHSs;
+  SmallVector<const Expr *, 4> CopyArrayElems;
+  for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
+    assert(C->getModifier() == OMPC_REDUCTION_inscan &&
+           "Only inscan reductions are expected.");
+    Privates.append(C->privates().begin(), C->privates().end());
+    ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
+    LHSs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
+    RHSs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+    CopyArrayElems.append(C->copy_array_elems().begin(),
+                          C->copy_array_elems().end());
+  }
   CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S);
   {
     // Emit loop with input phase:
@@ -3382,90 +3413,108 @@
     CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
     FirstGen(CGF);
   }
-  // Emit prefix reduction:
-  // for (int k = 0; k <= ceil(log2(n)); ++k)
-  llvm::BasicBlock *InputBB = CGF.Builder.GetInsertBlock();
-  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.outer.log.scan.body");
-  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.outer.log.scan.exit");
-  llvm::Function *F = CGF.CGM.getIntrinsic(llvm::Intrinsic::log2, CGF.DoubleTy);
-  llvm::Value *Arg =
-      CGF.Builder.CreateUIToFP(OMPScanNumIterations, CGF.DoubleTy);
-  llvm::Value *LogVal = CGF.EmitNounwindRuntimeCall(F, Arg);
-  F = CGF.CGM.getIntrinsic(llvm::Intrinsic::ceil, CGF.DoubleTy);
-  LogVal = CGF.EmitNounwindRuntimeCall(F, LogVal);
-  LogVal = CGF.Builder.CreateFPToUI(LogVal, CGF.IntTy);
-  llvm::Value *NMin1 = CGF.Builder.CreateNUWSub(
-      OMPScanNumIterations, llvm::ConstantInt::get(CGF.SizeTy, 1));
-  auto DL = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getBeginLoc());
-  CGF.EmitBlock(LoopBB);
-  auto *Counter = CGF.Builder.CreatePHI(CGF.IntTy, 2);
-  // size pow2k = 1;
-  auto *Pow2K = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
-  Counter->addIncoming(llvm::ConstantInt::get(CGF.IntTy, 0), InputBB);
-  Pow2K->addIncoming(llvm::ConstantInt::get(CGF.SizeTy, 1), InputBB);
-  // for (size i = n - 1; i >= 2 ^ k; --i)
-  //   tmp[i] op= tmp[i-pow2k];
-  llvm::BasicBlock *InnerLoopBB =
-      CGF.createBasicBlock("omp.inner.log.scan.body");
-  llvm::BasicBlock *InnerExitBB =
-      CGF.createBasicBlock("omp.inner.log.scan.exit");
-  llvm::Value *CmpI = CGF.Builder.CreateICmpUGE(NMin1, Pow2K);
-  CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
-  CGF.EmitBlock(InnerLoopBB);
-  auto *IVal = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
-  IVal->addIncoming(NMin1, LoopBB);
-  {
-    CodeGenFunction::OMPPrivateScope PrivScope(CGF);
-    auto *ILHS = LHSs.begin();
-    auto *IRHS = RHSs.begin();
-    for (const Expr *CopyArrayElem : CopyArrayElems) {
-      const auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
-      const auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
-      Address LHSAddr = Address::invalid();
-      {
-        CodeGenFunction::OpaqueValueMapping IdxMapping(
-            CGF,
-            cast<OpaqueValueExpr>(
-                cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
-            RValue::get(IVal));
-        LHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
-      }
-      PrivScope.addPrivate(LHSVD, [LHSAddr]() { return LHSAddr; });
-      Address RHSAddr = Address::invalid();
-      {
-        llvm::Value *OffsetIVal = CGF.Builder.CreateNUWSub(IVal, Pow2K);
-        CodeGenFunction::OpaqueValueMapping IdxMapping(
-            CGF,
-            cast<OpaqueValueExpr>(
-                cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
-            RValue::get(OffsetIVal));
-        RHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
+  // #pragma omp barrier // in parallel region
+  auto &&CodeGen = [&S, OMPScanNumIterations, &LHSs, &RHSs, &CopyArrayElems,
+                    &ReductionOps,
+                    &Privates](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    // Emit prefix reduction:
+    // #pragma omp master // in parallel region
+    // for (int k = 0; k <= ceil(log2(n)); ++k)
+    llvm::BasicBlock *InputBB = CGF.Builder.GetInsertBlock();
+    llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.outer.log.scan.body");
+    llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.outer.log.scan.exit");
+    llvm::Function *F =
+        CGF.CGM.getIntrinsic(llvm::Intrinsic::log2, CGF.DoubleTy);
+    llvm::Value *Arg =
+        CGF.Builder.CreateUIToFP(OMPScanNumIterations, CGF.DoubleTy);
+    llvm::Value *LogVal = CGF.EmitNounwindRuntimeCall(F, Arg);
+    F = CGF.CGM.getIntrinsic(llvm::Intrinsic::ceil, CGF.DoubleTy);
+    LogVal = CGF.EmitNounwindRuntimeCall(F, LogVal);
+    LogVal = CGF.Builder.CreateFPToUI(LogVal, CGF.IntTy);
+    llvm::Value *NMin1 = CGF.Builder.CreateNUWSub(
+        OMPScanNumIterations, llvm::ConstantInt::get(CGF.SizeTy, 1));
+    auto DL = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getBeginLoc());
+    CGF.EmitBlock(LoopBB);
+    auto *Counter = CGF.Builder.CreatePHI(CGF.IntTy, 2);
+    // size pow2k = 1;
+    auto *Pow2K = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
+    Counter->addIncoming(llvm::ConstantInt::get(CGF.IntTy, 0), InputBB);
+    Pow2K->addIncoming(llvm::ConstantInt::get(CGF.SizeTy, 1), InputBB);
+    // for (size i = n - 1; i >= 2 ^ k; --i)
+    //   tmp[i] op= tmp[i-pow2k];
+    llvm::BasicBlock *InnerLoopBB =
+        CGF.createBasicBlock("omp.inner.log.scan.body");
+    llvm::BasicBlock *InnerExitBB =
+        CGF.createBasicBlock("omp.inner.log.scan.exit");
+    llvm::Value *CmpI = CGF.Builder.CreateICmpUGE(NMin1, Pow2K);
+    CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+    CGF.EmitBlock(InnerLoopBB);
+    auto *IVal = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
+    IVal->addIncoming(NMin1, LoopBB);
+    {
+      CodeGenFunction::OMPPrivateScope PrivScope(CGF);
+      auto *ILHS = LHSs.begin();
+      auto *IRHS = RHSs.begin();
+      for (const Expr *CopyArrayElem : CopyArrayElems) {
+        const auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
+        const auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
+        Address LHSAddr = Address::invalid();
+        {
+          CodeGenFunction::OpaqueValueMapping IdxMapping(
+              CGF,
+              cast<OpaqueValueExpr>(
+                  cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+              RValue::get(IVal));
+          LHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
+        }
+        PrivScope.addPrivate(LHSVD, [LHSAddr]() { return LHSAddr; });
+        Address RHSAddr = Address::invalid();
+        {
+          llvm::Value *OffsetIVal = CGF.Builder.CreateNUWSub(IVal, Pow2K);
+          CodeGenFunction::OpaqueValueMapping IdxMapping(
+              CGF,
+              cast<OpaqueValueExpr>(
+                  cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+              RValue::get(OffsetIVal));
+          RHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
+        }
+        PrivScope.addPrivate(RHSVD, [RHSAddr]() { return RHSAddr; });
+        ++ILHS;
+        ++IRHS;
       }
-      PrivScope.addPrivate(RHSVD, [RHSAddr]() { return RHSAddr; });
-      ++ILHS;
-      ++IRHS;
+      PrivScope.Privatize();
+      CGF.CGM.getOpenMPRuntime().emitReduction(
+          CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
+          {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
     }
-    PrivScope.Privatize();
-    CGF.CGM.getOpenMPRuntime().emitReduction(
-        CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
-        {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
-  }
-  llvm::Value *NextIVal =
-      CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
-  IVal->addIncoming(NextIVal, CGF.Builder.GetInsertBlock());
-  CmpI = CGF.Builder.CreateICmpUGE(NextIVal, Pow2K);
-  CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
-  CGF.EmitBlock(InnerExitBB);
-  llvm::Value *Next =
-      CGF.Builder.CreateNUWAdd(Counter, llvm::ConstantInt::get(CGF.IntTy, 1));
-  Counter->addIncoming(Next, CGF.Builder.GetInsertBlock());
-  // pow2k <<= 1;
-  llvm::Value *NextPow2K = CGF.Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
-  Pow2K->addIncoming(NextPow2K, CGF.Builder.GetInsertBlock());
-  llvm::Value *Cmp = CGF.Builder.CreateICmpNE(Next, LogVal);
-  CGF.Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
-  auto DL1 = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getEndLoc());
-  CGF.EmitBlock(ExitBB);
+    llvm::Value *NextIVal =
+        CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
+    IVal->addIncoming(NextIVal, CGF.Builder.GetInsertBlock());
+    CmpI = CGF.Builder.CreateICmpUGE(NextIVal, Pow2K);
+    CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+    CGF.EmitBlock(InnerExitBB);
+    llvm::Value *Next =
+        CGF.Builder.CreateNUWAdd(Counter, llvm::ConstantInt::get(CGF.IntTy, 1));
+    Counter->addIncoming(Next, CGF.Builder.GetInsertBlock());
+    // pow2k <<= 1;
+    llvm::Value *NextPow2K =
+        CGF.Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
+    Pow2K->addIncoming(NextPow2K, CGF.Builder.GetInsertBlock());
+    llvm::Value *Cmp = CGF.Builder.CreateICmpNE(Next, LogVal);
+    CGF.Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
+    auto DL1 = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getEndLoc());
+    CGF.EmitBlock(ExitBB);
+  };
+  if (isOpenMPParallelDirective(S.getDirectiveKind())) {
+    CGF.CGM.getOpenMPRuntime().emitMasterRegion(CGF, CodeGen, S.getBeginLoc());
+    CGF.CGM.getOpenMPRuntime().emitBarrierCall(
+        CGF, S.getBeginLoc(), OMPD_unknown, /*EmitChecks=*/false,
+        /*ForceSimpleCall=*/true);
+  } else {
+    RegionCodeGenTy RCG(CodeGen);
+    RCG(CGF);
+  }
   CGF.OMPFirstScanLoop = false;
   SecondGen(CGF);
@@ -3502,6 +3551,8 @@
         emitForLoopBounds, emitDispatchForLoopBounds);
     };
+    if (!isOpenMPParallelDirective(S.getDirectiveKind()))
+      emitScanBasedDirectiveDecls(CGF, S, NumIteratorsGen);
     emitScanBasedDirective(CGF, S, NumIteratorsGen, FirstGen, SecondGen);
   } else {
     CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, S.getDirectiveKind(),
@@ -3955,6 +4006,19 @@
     (void)emitWorksharingDirective(CGF, S, S.hasCancel());
   };
   {
+    if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
+                     [](const OMPReductionClause *C) {
+                       return C->getModifier() == OMPC_REDUCTION_inscan;
+                     })) {
+      const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
+        CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
+        CGCapturedStmtInfo CGSI(CR_OpenMP);
+        CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGSI);
+        OMPLoopScope LoopScope(CGF, S);
+        return CGF.EmitScalarExpr(S.getNumIterations());
+      };
+      emitScanBasedDirectiveDecls(*this, S, NumIteratorsGen);
+    }
     auto LPCRegion =
         CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
     emitCommonOMPParallelDirective(*this, S, OMPD_for, CodeGen,
@@ -3973,6 +4037,19 @@
     (void)emitWorksharingDirective(CGF, S, /*HasCancel=*/false);
   };
   {
+    if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
+                     [](const OMPReductionClause *C) {
+                       return C->getModifier() == OMPC_REDUCTION_inscan;
+                     })) {
+      const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
+        CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
+        CGCapturedStmtInfo CGSI(CR_OpenMP);
+        CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGSI);
+        OMPLoopScope LoopScope(CGF, S);
+        return CGF.EmitScalarExpr(S.getNumIterations());
+      };
+      emitScanBasedDirectiveDecls(*this, S, NumIteratorsGen);
+    }
     auto LPCRegion =
         CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
     emitCommonOMPParallelDirective(*this, S, OMPD_for_simd, CodeGen,
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4596,6 +4596,17 @@
         }
       }
     }
+    if (ThisCaptureRegion == OMPD_parallel) {
+      // Capture temp arrays for inscan reductions.
+      for (OMPClause *C : Clauses) {
+        if (auto *RC = dyn_cast<OMPReductionClause>(C)) {
+          if (RC->getModifier() != OMPC_REDUCTION_inscan)
+            continue;
+          for (Expr *E : RC->copy_array_temps())
+            MarkDeclarationsReferencedInExpr(E);
+        }
+      }
+    }
     if (++CompletedRegions == CaptureRegions.size())
       DSAStack->setBodyComplete();
     SR = ActOnCapturedRegionEnd(SR.get());
diff --git a/clang/test/OpenMP/parallel_for_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_scan_codegen.cpp
--- a/clang/test/OpenMP/parallel_for_scan_codegen.cpp
+++ b/clang/test/OpenMP/parallel_for_scan_codegen.cpp
@@ -10,7 +10,7 @@
 #ifndef HEADER
 #define HEADER
 
-void foo();
+void foo(int n);
 void bar();
 
 // CHECK: define{{.*}} void @{{.*}}baz{{.*}}(i32 %n)
@@ -18,10 +18,16 @@
   static float a[10];
   static double b;
 
-  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
+  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
+  // double b_buffer[10];
+  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
 
-  // CHECK: call i8* @llvm.stacksave()
   // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
 
   // float a_buffer[10][n];
@@ -29,6 +35,9 @@
   // double b_buffer[10];
   // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
 
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call void @llvm.stackrestore(i8*
+
   #pragma omp parallel for reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -37,13 +46,13 @@
     // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
     // CHECK: br label %[[DISPATCH:[^,]+]]
     // CHECK: [[INPUT_PHASE:.+]]:
-    // CHECK: call void @{{.+}}foo{{.+}}()
+    // CHECK: call void @{{.+}}foo{{.+}}(
 
     // a_buffer[i][0..n] = a_priv[[0..n];
     // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
     // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
     // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
     // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
     // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -51,7 +60,7 @@
     // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
     // b_buffer[i] = b_priv;
-    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
     // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
     // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
     // CHECK: br label %[[LOOP_CONTINUE:.+]]
@@ -62,7 +71,7 @@
     // CHECK: call void @llvm.stackrestore(i8* %
     // CHECK: call void @__kmpc_for_static_fini(
     // CHECK: call void @__kmpc_barrier(
-    foo();
+    foo(n);
     #pragma omp scan inclusive(a[:n], b)
     // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
     // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
@@ -128,7 +137,7 @@
     // CHECK: br label %[[DISPATCH:[^,]+]]
 
     // Skip the before scan body.
-    // CHECK: call void @{{.+}}foo{{.+}}()
+    // CHECK: call void @{{.+}}foo{{.+}}(
 
     // CHECK: [[EXIT_INSCAN:[^,]+]]:
     // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
@@ -158,17 +167,8 @@
     // CHECK: [[LOOP_CONTINUE]]:
     // CHECK: call void @llvm.stackrestore(i8* %
     // CHECK: call void @__kmpc_for_static_fini(
-    // CHECK: call void @llvm.stackrestore(i8*
   }
 
-  // CHECK: call i8* @llvm.stacksave()
-  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
-
-  // float a_buffer[10][n];
-  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
-
-  // double b_buffer[10];
-  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
   #pragma omp parallel for reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -178,15 +178,15 @@
     // CHECK: br label %[[DISPATCH:[^,]+]]
 
     // Skip the before scan body.
-    // CHECK: call void @{{.+}}foo{{.+}}()
+    // CHECK: call void @{{.+}}foo{{.+}}(
 
     // CHECK: [[EXIT_INSCAN:[^,]+]]:
 
     // a_buffer[i][0..n] = a_priv[[0..n];
     // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
     // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
     // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
     // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
     // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -194,7 +194,7 @@
     // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
     // b_buffer[i] = b_priv;
-    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
    // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
     // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
     // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
@@ -210,7 +210,7 @@
     // CHECK: call void @llvm.stackrestore(i8* %
     // CHECK: call void @__kmpc_for_static_fini(
     // CHECK: call void @__kmpc_barrier(
-    foo();
+    foo(n);
     #pragma omp scan exclusive(a[:n], b)
     // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
     // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
@@ -276,7 +276,7 @@
     // CHECK: br label %[[DISPATCH:[^,]+]]
 
     // CHECK: [[SCAN_PHASE:.+]]:
-    // CHECK: call void @{{.+}}foo{{.+}}()
+    // CHECK: call void @{{.+}}foo{{.+}}(
     // CHECK: br label %[[LOOP_CONTINUE:.+]]
 
     // CHECK: [[DISPATCH]]:
@@ -305,7 +305,6 @@
     // CHECK: [[LOOP_CONTINUE]]:
     // CHECK: call void @llvm.stackrestore(i8* %
     // CHECK: call void @__kmpc_for_static_fini(
-    // CHECK: call void @llvm.stackrestore(i8*
   }
 }
diff --git a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp
--- a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp
+++ b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp
@@ -18,10 +18,15 @@
   static float a[10];
   static double b;
 
-  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
+  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
+  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
 
-  // CHECK: call i8* @llvm.stacksave()
   // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
 
   // float a_buffer[10][n];
@@ -29,6 +34,9 @@
   // double b_buffer[10];
   // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
 
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call void @llvm.stackrestore(i8*
+
   #pragma omp parallel for simd reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -42,8 +50,8 @@
     // a_buffer[i][0..n] = a_priv[[0..n];
     // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
     // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
     // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
     // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
     // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -51,7 +59,7 @@
     // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
     // b_buffer[i] = b_priv;
-    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
     // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
     // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
     // CHECK: br label %[[LOOP_CONTINUE:.+]]
@@ -158,17 +166,8 @@
     // CHECK: [[LOOP_CONTINUE]]:
     // CHECK: call void @llvm.stackrestore(i8* %
     // CHECK: call void @__kmpc_for_static_fini(
-    // CHECK: call void @llvm.stackrestore(i8*
   }
 
-  // CHECK: call i8* @llvm.stacksave()
-  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
-
-  // float a_buffer[10][n];
-  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
-
-  // double b_buffer[10];
-  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
   #pragma omp parallel for simd reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -185,8 +184,8 @@
     // a_buffer[i][0..n] = a_priv[[0..n];
     // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
     // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
     // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
     // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
     // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -194,7 +193,7 @@
     // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
     // b_buffer[i] = b_priv;
-    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
     // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
     // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
     // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
@@ -305,7 +304,6 @@
     // CHECK: [[LOOP_CONTINUE]]:
     // CHECK: call void @llvm.stackrestore(i8* %
     // CHECK: call void @__kmpc_for_static_fini(
-    // CHECK: call void @llvm.stackrestore(i8*
   }
 }
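Note (editor's addition, not part of the patch): a minimal self-contained sketch of the source pattern the tests above compile, plus a serial rendition of the prefix pass described in the emitScanBasedDirective doc comment. Only the directive form and the reduction list come from the tests; prefixPass, results, and the loop bodies are illustrative assumptions.

#include <cmath>
#include <cstddef>

// Mirrors clang/test/OpenMP/parallel_for_scan_codegen.cpp: with this patch the
// scan buffers for a[:n] and b are allocated in baz's frame before
// __kmpc_fork_call, instead of inside the outlined parallel function.
void baz(int n) {
  static float a[10];
  static double b;
  static double results[10];
#pragma omp parallel for reduction(inscan, + : a[:n], b)
  for (int i = 0; i < 10; ++i) {
    // Input phase: update the private copies of the reduction variables.
    for (int j = 0; j < n && j < 10; ++j)
      a[j] += i;
    b += i;
#pragma omp scan inclusive(a[:n], b)
    // Scan phase: a and b hold the inclusive prefix sums up to iteration i.
    results[i] = b;
  }
}

// Serial form of the up-sweep the codegen emits between the two loops (blocks
// omp.outer.log.scan.body / omp.inner.log.scan.body); under this patch it runs
// in a master region followed by a barrier when the directive is parallel.
// Unlike the doc comment's pseudocode, the inner loop decrements the element
// index i, not k.
void prefixPass(double *buffer, std::size_t num_iters) {
  if (num_iters < 2)
    return;
  auto rounds = (std::size_t)std::ceil(std::log2((double)num_iters));
  std::size_t pow2k = 1;
  for (std::size_t k = 0; k != rounds; ++k, pow2k <<= 1)
    for (std::size_t i = num_iters - 1; i >= pow2k; --i)
      buffer[i] += buffer[i - pow2k]; // buffer[i] op= buffer[i - pow2k]
}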