Index: include/polly/CodeGen/CodeGeneration.h
===================================================================
--- include/polly/CodeGen/CodeGeneration.h
+++ include/polly/CodeGen/CodeGeneration.h
@@ -36,6 +36,8 @@
   PreservedAnalyses run(Scop &S, ScopAnalysisManager &SAM,
                         ScopStandardAnalysisResults &AR, SPMUpdater &U);
 };
+
+extern bool PerfMonitoring;
 } // namespace polly

 #endif // POLLY_CODEGENERATION_H
Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -168,6 +168,16 @@
         "Do not add parameter bounds and do no gist simplify sets accordingly"),
     cl::Hidden, cl::init(false), cl::cat(PollyCategory));

+static cl::opt<bool> PollyAllowDereferenceOfAllFunctionParams(
+    "polly-allow-dereference-of-all-function-parameters",
+    cl::desc(
+        "Treat all parameters to functions that are pointers as dereferenceable."
+        " This is useful for invariant load hoisting, since we can generate"
+        " fewer runtime checks. This is only valid if all pointer parameters"
+        " are always initialized, so that Polly can choose to hoist"
+        " their loads."),
+    cl::Hidden, cl::init(false), cl::cat(PollyCategory));
+
 static cl::opt<bool> PollyPreciseFoldAccesses(
     "polly-precise-fold-accesses",
     cl::desc("Fold memory accesses to model more possible delinearizations "
@@ -3827,11 +3837,23 @@
   return nullptr;
 }

+bool isAParameter(llvm::Value *maybeParam, const Function &F) {
+  for (const llvm::Argument &Arg : F.args())
+    if (&Arg == maybeParam)
+      return true;
+
+  return false;
+}
+
 bool Scop::canAlwaysBeHoisted(MemoryAccess *MA, bool StmtInvalidCtxIsEmpty,
                               bool MAInvalidCtxIsEmpty,
                               bool NonHoistableCtxIsEmpty) {
   LoadInst *LInst = cast<LoadInst>(MA->getAccessInstruction());
   const DataLayout &DL = LInst->getParent()->getModule()->getDataLayout();
+  if (PollyAllowDereferenceOfAllFunctionParams &&
+      isAParameter(LInst->getPointerOperand(), getFunction()))
+    return true;
+
   // TODO: We can provide more information for better but more expensive
   //       results.
   if (!isDereferenceableAndAlignedPointer(LInst->getPointerOperand(),
Index: lib/CodeGen/CodeGeneration.cpp
===================================================================
--- lib/CodeGen/CodeGeneration.cpp
+++ lib/CodeGen/CodeGeneration.cpp
@@ -49,10 +49,12 @@
     cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

-static cl::opt<bool>
-    PerfMonitoring("polly-codegen-perf-monitoring",
-                   cl::desc("Add run-time performance monitoring"), cl::Hidden,
-                   cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+bool polly::PerfMonitoring;
+static cl::opt<bool, true>
+    XPerfMonitoring("polly-codegen-perf-monitoring",
+                    cl::desc("Add run-time performance monitoring"), cl::Hidden,
+                    cl::location(polly::PerfMonitoring), cl::init(false),
+                    cl::ZeroOrMore, cl::cat(PollyCategory));

 namespace polly {
 /// Mark a basic block unreachable.
Index: lib/CodeGen/ManagedMemoryRewrite.cpp
===================================================================
--- lib/CodeGen/ManagedMemoryRewrite.cpp
+++ lib/CodeGen/ManagedMemoryRewrite.cpp
@@ -124,8 +124,8 @@
   Instruction *I = Cur->getAsInstruction();
   assert(I && "unable to convert ConstantExpr to Instruction");

-  DEBUG(dbgs() << "Expanding ConstantExpression: " << *Cur
-               << " | in Instruction: " << *I << "\n";);
+  DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur
+               << ") in Instruction: (" << *I << ")\n";);

   // Invalidate `Cur` so that no one after this point uses `Cur`. Rather,
   // they should mutate `I`.
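As a quick illustration of the new flag in the ScopInfo hunk above, here is a sketch of the kind of input it targets (hypothetical code, not part of the patch). The trip count is loaded from a pointer parameter, so the load is loop-invariant but not provably dereferenceable; without the flag Polly guards the hoisted load with a runtime check, and with -polly-allow-dereference-of-all-function-parameters it hoists the load unconditionally:

void f(int *n, float A[][512]) {
  // *n is loop-invariant. Under the flag, Polly assumes the parameter n
  // is dereferenceable and hoists this load without emitting a
  // polly.preload.cond runtime check.
  for (int i = 0; i < *n; i++)
    for (int j = 0; j < *n; j++)
      A[i][j] += 1.0f;
}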
@@ -208,12 +208,22 @@
   const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() ||
                                        Array.hasInternalLinkage() ||
                                        IgnoreLinkageForGlobals;
-  if (!OnlyVisibleInsideModule)
+  if (!OnlyVisibleInsideModule) {
+    DEBUG(dbgs() << "Not rewriting (" << Array
+                 << ") to managed memory "
+                    "because it could be visible externally. To force rewrite, "
+                    "use -polly-acc-rewrite-ignore-linkage-for-globals.\n");
     return;
+  }

   if (!Array.hasInitializer() ||
-      !isa<ConstantAggregateZero>(Array.getInitializer()))
+      !isa<ConstantAggregateZero>(Array.getInitializer())) {
+    DEBUG(dbgs() << "Not rewriting (" << Array
+                 << ") to managed memory "
+                    "because it has an initializer which is "
+                    "not a zeroinitializer.\n");
     return;
+  }

   // At this point, we have committed to replacing this array.
   ReplacedGlobals.insert(&Array);
@@ -278,14 +288,14 @@
       auto *Alloca = dyn_cast<AllocaInst>(&I);
       if (!Alloca)
         continue;
-      dbgs() << "Checking if " << *Alloca << "may be captured: ";
+      DEBUG(dbgs() << "Checking if (" << *Alloca << ") may be captured: ");

       if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false,
                                /* StoreCaptures */ true)) {
         Allocas.insert(Alloca);
-        DEBUG(dbgs() << "YES (captured)\n");
+        DEBUG(dbgs() << "YES (captured).\n");
       } else {
-        DEBUG(dbgs() << "NO (not captured)\n");
+        DEBUG(dbgs() << "NO (not captured).\n");
       }
     }
   }
@@ -293,7 +303,7 @@

 static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca,
                                          const DataLayout &DL) {
-  DEBUG(dbgs() << "rewriting: " << *Alloca << " to managed mem.\n");
+  DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n");
   Module *M = Alloca->getModule();
   assert(M && "Alloca does not have a module");
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -16,6 +16,7 @@
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/IslAst.h"
 #include "polly/CodeGen/IslNodeBuilder.h"
+#include "polly/CodeGen/PerfMonitor.h"
 #include "polly/CodeGen/Utils.h"
 #include "polly/DependenceInfo.h"
 #include "polly/LinkAllPasses.h"
@@ -122,6 +123,8 @@
     cl::desc("Minimal number of compute statements to run on GPU."),
     cl::Hidden, cl::init(10 * 512 * 512));

+extern bool polly::PerfMonitoring;
+
 /// Return a unique name for a Scop, which is the scop region with the
 /// function name.
 std::string getUniqueScopName(const Scop *S) {
@@ -1380,8 +1383,8 @@

 /// A list of functions that are available in NVIDIA's libdevice.
 const std::set<std::string> CUDALibDeviceFunctions = {
-    "exp", "expf", "expl", "cos", "cosf",
-    "sqrt", "sqrtf", "copysign", "copysignf", "copysignl"};
+    "exp", "expf", "expl", "cos", "cosf", "sqrt",
+    "sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"};

 /// Return the corresponding CUDA libdevice function name for @p F.
 ///
@@ -1406,7 +1409,7 @@
   return F->isIntrinsic() &&
          (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
-          Name.startswith("llvm.copysign"));
+          Name.startswith("llvm.copysign") || Name.startswith("llvm.powi"));
 }

 /// Do not take `Function` as a subtree value.
@@ -3433,6 +3436,22 @@
     isl_ast_node_free(Root);
   } else {

+    if (polly::PerfMonitoring) {
+      PerfMonitor P(*S, EnteringBB->getParent()->getParent());
+      P.initialize();
+      P.insertRegionStart(SplitBlock->getTerminator());
+
+      // TODO: actually think if this is the correct exiting block to place
+      //       the `end` performance marker. Invariant load hoisting changes
+      //       the CFG in a way that I do not precisely understand, so I
+      //       (Siddharth) should come back to this and
+      //       think about which exiting block to use.
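+      //
+      // A sketch of the CFG shape the code below assumes (the
+      // getUniqueSuccessor() calls and the assert are what actually enforce
+      // the unique-successor parts):
+      //
+      //   SplitBlock -> StartBlock -> ExitingBlock -> MergeBlock
+      //
+      // The start marker goes before the branch in SplitBlock, the end
+      // marker before the terminator of MergeBlock.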
+      auto *ExitingBlock = StartBlock->getUniqueSuccessor();
+      assert(ExitingBlock);
+      BasicBlock *MergeBlock = ExitingBlock->getUniqueSuccessor();
+      P.insertRegionEnd(MergeBlock->getTerminator());
+    }
+
     NodeBuilder.addParameters(S->getContext().release());
     Value *RTC = NodeBuilder.createRTC(Condition);
     Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
Index: lib/Support/ISLTools.cpp
===================================================================
--- lib/Support/ISLTools.cpp
+++ lib/Support/ISLTools.cpp
@@ -366,13 +366,6 @@
   auto WriteActions =
       give(isl_union_map_apply_domain(Schedule.copy(), Writes.copy()));

-  // { [Element[] -> Scatter[] }
-  auto AfterReads = afterScatter(ReadActions, ReadEltInSameInst);
-  auto WritesBeforeAnyReads =
-      give(isl_union_map_subtract(WriteActions.take(), AfterReads.take()));
-  auto BeforeWritesBeforeAnyReads =
-      beforeScatter(WritesBeforeAnyReads, !IncludeWrite);
-
   // { [Element[] -> DomainWrite[]] -> Scatter[] }
   auto EltDomWrites = give(isl_union_map_apply_range(
       isl_union_map_range_map(isl_union_map_reverse(Writes.copy())),
@@ -390,15 +383,26 @@
   auto ReadsOverwrittenRotated = give(isl_union_map_reverse(
       isl_union_map_curry(reverseDomain(ReadsOverwritten).take())));
   auto LastOverwrittenRead =
-      give(isl_union_map_lexmax(ReadsOverwrittenRotated.take()));
+      give(isl_union_map_lexmax(ReadsOverwrittenRotated.copy()));

   // { [Element[] -> DomainWrite[]] -> Scatter[] }
   auto BetweenLastReadOverwrite = betweenScatter(
       LastOverwrittenRead, EltDomWrites, IncludeLastRead, IncludeWrite);

-  return give(isl_union_map_union(
-      BeforeWritesBeforeAnyReads.take(),
-      isl_union_map_domain_factor_domain(BetweenLastReadOverwrite.take())));
+  // { [Element[] -> Scatter[]] -> DomainWrite[] }
+  isl::union_map ReachingOverwriteZone = computeReachingWrite(
+      Schedule, Writes, true, IncludeLastRead, IncludeWrite);
+
+  // { [Element[] -> DomainWrite[]] -> Scatter[] }
+  isl::union_map ReachingOverwriteRotated =
+      reverseDomain(ReachingOverwriteZone).curry().reverse();
+
+  // { [Element[] -> DomainWrite[]] -> Scatter[] }
+  isl::union_map WritesWithoutReads = ReachingOverwriteRotated.subtract_domain(
+      ReadsOverwrittenRotated.domain());
+
+  return BetweenLastReadOverwrite.unite(WritesWithoutReads)
+      .domain_factor_domain();
 }

 isl::union_set polly::convertZoneToTimepoints(isl::union_set Zone,
Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -483,61 +483,6 @@
   return Node;
 }

-/// Get the position of a dimension with a non-zero coefficient.
-///
-/// Check that isl constraint @p Constraint has only one non-zero
-/// coefficient for dimensions that have type @p DimType. If this is true,
-/// return the position of the dimension corresponding to the non-zero
-/// coefficient and negative value, otherwise.
-///
-/// @param Constraint The isl constraint to be checked.
-/// @param DimType    The type of the dimensions.
-/// @return           The position of the dimension in case the isl
-///                   constraint satisfies the requirements, a negative
-///                   value, otherwise.
-static int getMatMulConstraintDim(isl::constraint Constraint,
-                                  isl::dim DimType) {
-  int DimPos = -1;
-  auto LocalSpace = Constraint.get_local_space();
-  int LocalSpaceDimNum = LocalSpace.dim(DimType);
-  for (int i = 0; i < LocalSpaceDimNum; i++) {
-    auto Val = Constraint.get_coefficient_val(DimType, i);
-    if (Val.is_zero())
-      continue;
-    if (DimPos >= 0 || (DimType == isl::dim::out && !Val.is_one()) ||
-        (DimType == isl::dim::in && !Val.is_negone()))
-      return -1;
-    DimPos = i;
-  }
-  return DimPos;
-}
-
-/// Check the form of the isl constraint.
-///
-/// Check that the @p DimInPos input dimension of the isl constraint
-/// @p Constraint has a coefficient that is equal to negative one, the @p
-/// DimOutPos has a coefficient that is equal to one and others
-/// have coefficients equal to zero.
-///
-/// @param Constraint The isl constraint to be checked.
-/// @param DimInPos   The input dimension of the isl constraint.
-/// @param DimOutPos  The output dimension of the isl constraint.
-/// @return           isl_stat_ok in case the isl constraint satisfies
-///                   the requirements, isl_stat_error otherwise.
-static isl_stat isMatMulOperandConstraint(isl::constraint Constraint,
-                                          int &DimInPos, int &DimOutPos) {
-  auto Val = Constraint.get_constant_val();
-  if (!isl_constraint_is_equality(Constraint.get()) || !Val.is_zero())
-    return isl_stat_error;
-  DimInPos = getMatMulConstraintDim(Constraint, isl::dim::in);
-  if (DimInPos < 0)
-    return isl_stat_error;
-  DimOutPos = getMatMulConstraintDim(Constraint, isl::dim::out);
-  if (DimOutPos < 0)
-    return isl_stat_error;
-  return isl_stat_ok;
-}
-
 /// Permute the two dimensions of the isl map.
 ///
 /// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
@@ -585,30 +530,49 @@
 ///                  second output dimension.
 /// @return          True in case @p AccMap has the expected form and false,
 ///                  otherwise.
-static bool isMatMulOperandAcc(isl::map AccMap, int &FirstPos, int &SecondPos) {
-  int DimInPos[] = {FirstPos, SecondPos};
-  auto Lambda = [=, &DimInPos](isl::basic_map BasicMap) -> isl::stat {
-    auto Constraints = BasicMap.get_constraint_list();
-    if (isl_constraint_list_n_constraint(Constraints.get()) != 2)
-      return isl::stat::error;
-    for (int i = 0; i < 2; i++) {
-      auto Constraint =
-          isl::manage(isl_constraint_list_get_constraint(Constraints.get(), i));
-      int InPos, OutPos;
-      if (isMatMulOperandConstraint(Constraint, InPos, OutPos) ==
-              isl_stat_error ||
-          OutPos > 1 || (DimInPos[OutPos] >= 0 && DimInPos[OutPos] != InPos))
-        return isl::stat::error;
-      DimInPos[OutPos] = InPos;
-    }
-    return isl::stat::ok;
-  };
-  if (AccMap.foreach_basic_map(Lambda) != isl::stat::ok || DimInPos[0] < 0 ||
-      DimInPos[1] < 0)
+static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
+                               int &SecondPos) {
+
+  isl::space Space = AccMap.get_space();
+  isl::map Universe = isl::map::universe(Space);
+
+  if (Space.dim(isl::dim::out) != 2)
     return false;
-  FirstPos = DimInPos[0];
-  SecondPos = DimInPos[1];
-  return true;
+
+  // MatMul has the form:
+  // for (i = 0; i < N; i++)
+  //   for (j = 0; j < M; j++)
+  //     for (k = 0; k < P; k++)
+  //       C[i, j] += A[i, k] * B[k, j]
+  //
+  // Permutation of three outer loops: 3! = 6 possibilities.
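+  //
+  // Numbering the loop dimensions i = 0, j = 1, k = 2, each
+  // (FirstDims[i], SecondDims[i]) pair below is one candidate pairing of
+  // input dimensions with the two output dimensions of the access:
+  // (0, 1) matches C[i][j], (0, 2) matches A[i][k], (2, 1) matches
+  // B[k][j], and the remaining pairs cover the other loop orders.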
+  int FirstDims[] = {0, 0, 1, 1, 2, 2};
+  int SecondDims[] = {1, 2, 2, 0, 0, 1};
+  for (int i = 0; i < 6; i += 1) {
+    auto PossibleMatMul =
+        Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
+            .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
+
+    AccMap = AccMap.intersect_domain(Domain);
+    PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
+
+    // If AccMap spans the entire domain (i.e. it is not a partial write),
+    // compute FirstPos and SecondPos.
+    // If AccMap != PossibleMatMul here (both maps have been restricted to
+    // the statement domain at this point), the write does not cover the
+    // whole domain; in other words it is a partial write, and partial
+    // writes must be rejected.
+    if (AccMap.is_equal(PossibleMatMul)) {
+      if (FirstPos != -1 && FirstPos != FirstDims[i])
+        continue;
+      FirstPos = FirstDims[i];
+      if (SecondPos != -1 && SecondPos != SecondDims[i])
+        continue;
+      SecondPos = SecondDims[i];
+      return true;
+    }
+  }
+
+  return false;
 }

 /// Does the memory access represent a non-scalar operand of the matrix
@@ -627,18 +591,16 @@
   if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
     return false;
   auto AccMap = MemAccess->getLatestAccessRelation();
-  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.j) && !MMI.ReadFromC &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  isl::set StmtDomain = MemAccess->getStatement()->getDomain();
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
     MMI.ReadFromC = MemAccess;
     return true;
   }
-  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.k) && !MMI.A &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
     MMI.A = MemAccess;
     return true;
   }
-  if (isMatMulOperandAcc(AccMap, MMI.k, MMI.j) && !MMI.B &&
-      isl_map_n_basic_map(AccMap.get()) == 1) {
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
     MMI.B = MemAccess;
     return true;
   }
@@ -758,8 +720,7 @@
     if (!MemAccessPtr->isWrite())
       return false;
     auto AccMap = MemAccessPtr->getLatestAccessRelation();
-    if (isl_map_n_basic_map(AccMap.get()) != 1 ||
-        !isMatMulOperandAcc(AccMap, MMI.i, MMI.j))
+    if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
       return false;
     MMI.WriteToC = MemAccessPtr;
     break;
Index: lib/Transform/ScopInliner.cpp
===================================================================
--- lib/Transform/ScopInliner.cpp
+++ lib/Transform/ScopInliner.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/Inliner.h"

 #define DEBUG_TYPE "polly-scop-inliner"

@@ -29,11 +30,15 @@
 extern bool polly::PollyAllowFullFunction;

 namespace {
-class ScopInliner : public CallGraphSCCPass {
+class ScopInliner : public LegacyInlinerBase {
+private:
+  std::map<Function *, InlineCost> InlineCostCache;
+
 public:
   static char ID;
-  ScopInliner() : CallGraphSCCPass(ID) {}
+  ScopInliner() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
+    // initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+  }

   bool doInitialization(CallGraph &CG) override {
     if (!polly::PollyAllowFullFunction) {
       report_fatal_error(
           "Aborting from ScopInliner because it only makes sense to run with "
@@ -45,24 +50,26 @@
           " enabled. "
          " If not, the entry block is not included in the Scop");
     }
-    return true;
+    return LegacyInlinerBase::doInitialization(CG);
+  }
+
+  bool doFinalization(CallGraph &CG) override {
+    InlineCostCache.clear();
+    return LegacyInlinerBase::doFinalization(CG);
   }

-  bool runOnSCC(CallGraphSCC &SCC) override {
-    // We do not try to inline non-trivial SCCs because this would lead to
-    // "infinite" inlining if we are not careful.
-    if (SCC.size() > 1)
-      return false;
-    assert(SCC.size() == 1 && "found empty SCC");
-    Function *F = (*SCC.begin())->getFunction();
-
-    // If the function is a nullptr, or the function is a declaration.
-    if (!F)
-      return false;
-    if (F->isDeclaration()) {
-      DEBUG(dbgs() << "Skipping " << F->getName()
-                   << "because it is a declaration.\n");
-      return false;
+  InlineCost getInlineCost(CallSite CS) override {
+    Function *F = CS.getCalledFunction();
+
+    if (!F || F->isDeclaration())
+      return InlineCost::getNever();
+
+    DEBUG(dbgs() << "Scop inliner running on: " << F->getName() << " | ");
+
+    std::map<Function *, InlineCost>::iterator It;
+    if ((It = InlineCostCache.find(F)) != InlineCostCache.end()) {
+      DEBUG(dbgs() << "(cached) will inline? " << (bool)It->second << ".\n");
+      return It->second;
     }

     PassBuilder PB;
@@ -73,32 +80,47 @@
     RegionInfo &RI = FAM.getResult<RegionInfoAnalysis>(*F);
     ScopDetection &SD = FAM.getResult<ScopAnalysis>(*F);

-    const bool HasScopAsTopLevelRegion =
-        SD.ValidRegions.count(RI.getTopLevelRegion()) > 0;
-
-    if (HasScopAsTopLevelRegion) {
-      DEBUG(dbgs() << "Skipping " << F->getName()
-                   << " has scop as top level region");
-      F->addFnAttr(llvm::Attribute::AlwaysInline);
-
-      ModuleAnalysisManager MAM;
-      PB.registerModuleAnalyses(MAM);
-      ModulePassManager MPM;
-      MPM.addPass(AlwaysInlinerPass());
-      Module *M = F->getParent();
-      assert(M && "Function has illegal module");
-      MPM.run(*M, MAM);
-    } else {
-      DEBUG(dbgs() << F->getName()
-                   << " does NOT have scop as top level region\n");
+    const auto TopLevelRegion = RI.getTopLevelRegion();
+
+    // Whether the entire function can be modeled as a Scop.
+    const bool IsFullyModeledAsScop =
+        SD.ValidRegions.count(TopLevelRegion) > 0;
+
+    // Whether some direct child of the top-level region is modeled as a
+    // Scop.
+    const bool IsModeledByTopLevelChildren = [&] {
+      for (auto ScopRegion : SD.ValidRegions)
+        if (ScopRegion->getParent() == TopLevelRegion)
+          return true;
+      return false;
+    }();
+
+    const InlineCost AnalyzedInlineCost = [&] {
+      if (IsFullyModeledAsScop || IsModeledByTopLevelChildren)
+        return InlineCost::getAlways();
+      return InlineCost::getNever();
+    }();
+
+    assert(InlineCostCache.find(F) == InlineCostCache.end() &&
+           "Function must not already have a cached inline cost.");
+    // Can't use InlineCostCache[F] = AnalyzedInlineCost because
+    // copy-ctor of InlineCost has been deleted. Joy.
+    InlineCostCache.insert(std::make_pair(F, AnalyzedInlineCost));
+    DEBUG(dbgs() << "will inline? " << (bool)(AnalyzedInlineCost) << ".\n");
+
+    // If we decided to inline, invalidate the caller's cached cost, since
+    // inlining will change the caller's body.
+    if (AnalyzedInlineCost) {
+      Function *Caller = CS.getCaller();
+      assert(Caller && "Callsite has invalid caller");

+      InlineCostCache.erase(Caller);
     }

-    return false;
-  };
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    CallGraphSCCPass::getAnalysisUsage(AU);
+    return AnalyzedInlineCost;
   }
+
+  // Do whatever alwaysinliner does.
+  bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
+
 };
 } // namespace
Index: lib/Transform/ZoneAlgo.cpp
===================================================================
--- lib/Transform/ZoneAlgo.cpp
+++ lib/Transform/ZoneAlgo.cpp
@@ -182,17 +182,15 @@
                                                   isl::union_set Writes,
                                                   bool InclDef,
                                                   bool InclRedef) {
-  // { DomainWrite[] -> Element[] }
-  auto Defs = give(isl_union_map_from_domain(Writes.take()));
+  isl::union_map Defs = isl::union_map::from_domain(Writes);

   // { [Element[] -> Scatter[]] -> DomainWrite[] }
   auto ReachDefs =
       computeReachingDefinition(Schedule, Defs, InclDef, InclRedef);

   // { Scatter[] -> DomainWrite[] }
-  return give(isl_union_set_unwrap(
-      isl_union_map_range(isl_union_map_curry(ReachDefs.take()))));
+  return ReachDefs.curry().range().unwrap();
 }

 /// Compute the reaching definition of a scalar.
 ///
@@ -209,16 +207,14 @@
 static isl::map computeScalarReachingDefinition(isl::union_map Schedule,
                                                 isl::set Writes, bool InclDef,
                                                 bool InclRedef) {
-  auto DomainSpace = give(isl_set_get_space(Writes.keep()));
-  auto ScatterSpace = getScatterSpace(Schedule);
+  isl::space DomainSpace = Writes.get_space();
+  isl::space ScatterSpace = getScatterSpace(Schedule);

   // { Scatter[] -> DomainWrite[] }
-  auto UMap = computeScalarReachingDefinition(
-      Schedule, give(isl_union_set_from_set(Writes.take())), InclDef,
-      InclRedef);
+  isl::union_map UMap = computeScalarReachingDefinition(
+      Schedule, isl::union_set(Writes), InclDef, InclRedef);

-  auto ResultSpace = give(isl_space_map_from_domain_and_range(
-      ScatterSpace.take(), DomainSpace.take()));
+  isl::space ResultSpace = ScatterSpace.map_from_domain_and_range(DomainSpace);
   return singleton(UMap, ResultSpace);
 }
Index: test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
===================================================================
--- test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
+++ test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
@@ -4,3 +4,6 @@
 define float @__nv_cosf(float %a) {
   ret float %a
 }
+define float @__nv_logf(float %a) {
+  ret float %a
+}
Index: test/GPGPU/intrinsic-copied-into-kernel.ll
===================================================================
--- test/GPGPU/intrinsic-copied-into-kernel.ll
+++ test/GPGPU/intrinsic-copied-into-kernel.ll
@@ -14,6 +14,7 @@
 ; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
 ; KERNEL-IR: declare float @llvm.sqrt.f32(float)
 ; KERNEL-IR: declare float @llvm.fabs.f32(float)
+; KERNEL-IR: declare float @llvm.powi.f32(float, i32)

 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
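As an aside, the ZoneAlgo.cpp hunk above is part of the wider migration from the reference-counted isl C API (wrapped in give()/take()/keep()) to the typed isl C++ bindings, which manage reference counts themselves. A minimal sketch of the pattern (UMap is a stand-in name):

  // Before: ownership is threaded through give()/take() by hand.
  // isl::union_map Curried = give(isl_union_map_curry(UMap.take()));

  // After: the binding bumps and releases isl refcounts internally.
  isl::union_map Curried = UMap.curry();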
@@ -26,7 +27,8 @@
 ;      float tmp1 = sqrt(tmp1);
 ;      float tmp2 = fabs(tmp2);
 ;      float tmp3 = copysignf(tmp1, tmp2);
-;      B[i] = tmp3;
+;      float tmp4 = powi(tmp3, 2);
+;      B[i] = tmp4;
 ;    }
 ; }

@@ -51,8 +53,9 @@
   %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
   %fabs = tail call float @llvm.fabs.f32(float %sqrt);
   %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
+  %powi = tail call float @llvm.powi.f32(float %copysign, i32 2);
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %copysign, float* %B.arr.i, align 4
+  store float %powi, float* %B.arr.i, align 4

   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -70,6 +73,7 @@
 declare float @llvm.sqrt.f32(float) #0
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.copysign.f32(float, float) #0
+declare float @llvm.powi.f32(float, i32) #0

 attributes #0 = { nounwind readnone }
Index: test/GPGPU/invalid-kernel-assert-verifymodule.ll
===================================================================
--- test/GPGPU/invalid-kernel-assert-verifymodule.ll
+++ test/GPGPU/invalid-kernel-assert-verifymodule.ll
@@ -4,7 +4,7 @@
 ; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
 ; fail on an illegal module.

-; REQUIRES: pollyacc
+; REQUIRES: pollyacc, asserts
 ; XFAIL: *
 ;
 ; void foo(long A[1024], long B[1024]) {
Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll
===================================================================
--- test/GPGPU/libdevice-functions-copied-into-kernel.ll
+++ test/GPGPU/libdevice-functions-copied-into-kernel.ll
@@ -20,6 +20,7 @@
 ; Check that the intrinsic call is present in the kernel IR.
 ; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_)
 ; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf)
+; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf)

 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
@@ -29,9 +30,10 @@
 ; void f(float *A, float *B, int N) {
 ;   for(int i = 0; i < N; i++) {
 ;     float tmp0 = A[i];
-;     float tmp1 = expf(tmp1);
-;     tmp1 = cosf(tmp1);
-;     B[i] = tmp1;
+;     float tmp1 = expf(tmp0);
+;     float tmp2 = cosf(tmp1);
+;     float tmp3 = logf(tmp2);
+;     B[i] = tmp3;
 ;   }
 ; }

@@ -55,8 +57,9 @@
   ; Call to intrinsics that should be part of the kernel.
   %expf = tail call float @expf(float %A.arr.i.val)
   %cosf = tail call float @cosf(float %expf)
+  %logf = tail call float @logf(float %cosf)
   %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %expf, float* %B.arr.i, align 4
+  store float %logf, float* %B.arr.i, align 4

   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %wide.trip.count = zext i32 %N to i64
@@ -73,6 +76,7 @@
 ; Function Attrs: nounwind readnone
 declare float @expf(float) #0
 declare float @cosf(float) #0
+declare float @logf(float) #0

 attributes #0 = { nounwind readnone }
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -0,0 +1,59 @@
+; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
+;   for (int i = 0; i < 2; i+=1)
+;     for (int j = 0; j < 2; j+=1)
+;       for (int k = 0; k < 784; k+=1)
+;         C[i][j] += A[i][k] * B[k][j];
+;}
+;
+; Check that the pattern matching detects the matrix multiplication pattern
+; when the AccMap cannot be reduced to a single disjunct.
+;
+; CHECK: The matrix multiplication pattern was detected
+;
+; ModuleID = 'pattern_matching_based_opts_splitmap.ll'
+;
+; Function Attrs: noinline nounwind uwtable
+define void @pattern_matching_based_opts_splitmap([2 x double]* noalias dereferenceable(32) %C, [784 x double]* noalias dereferenceable(12544) %A, [2 x double]* noalias dereferenceable(12544) %B) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc21
+  %i = phi i64 [ 0, %entry ], [ %add22, %for.inc21 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.inc18
+  %j = phi i64 [ 0, %for.body ], [ %add19, %for.inc18 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body3, %for.body6
+  %k = phi i64 [ 0, %for.body3 ], [ %add17, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [784 x double], [784 x double]* %A, i64 %i, i64 %k
+  %tmp6 = load double, double* %arrayidx8, align 8
+  %arrayidx12 = getelementptr inbounds [2 x double], [2 x double]* %B, i64 %k, i64 %j
+  %tmp10 = load double, double* %arrayidx12, align 8
+  %mul = fmul double %tmp6, %tmp10
+  %arrayidx16 = getelementptr inbounds [2 x double], [2 x double]* %C, i64 %i, i64 %j
+  %tmp14 = load double, double* %arrayidx16, align 8
+  %add = fadd double %tmp14, %mul
+  store double %add, double* %arrayidx16, align 8
+  %add17 = add nsw i64 %k, 1
+  %cmp5 = icmp slt i64 %add17, 784
+  br i1 %cmp5, label %for.body6, label %for.inc18
+
+for.inc18:                                        ; preds = %for.body6
+  %add19 = add nsw i64 %j, 1
+  %cmp2 = icmp slt i64 %add19, 2
+  br i1 %cmp2, label %for.body3, label %for.inc21
+
+for.inc21:                                        ; preds = %for.inc18
+  %add22 = add nsw i64 %i, 1
+  %cmp = icmp slt i64 %add22, 2
+  br i1 %cmp, label %for.body, label %for.end23
+
+for.end23:                                        ; preds = %for.inc21
+  ret void
+}
+
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop
@@ -0,0 +1,46 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "784" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      }
+   ],
+   "context" : "{ : }",
+   "name" : "%for.body---%for.end23",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1 and 0 <= i1 <= 1 and 0 <= i2 <= 783 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }"
+      }
+   ]
+}
Index: test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop.transformed
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/pattern_matching_based_opts_splitmap___%for.body---%for.end23.jscop.transformed
@@ -0,0 +1,46 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "784" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "2" ],
+         "type" : "double"
+      }
+   ],
+   "context" : "{ : }",
+   "name" : "%for.body---%for.end23",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] : i2 <= 784 - i0 - i1; Stmt_for_body6[1, 1, 783] -> MemRef_C[1, 1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1 and 0 <= i1 <= 1 and 0 <= i2 <= 783 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }"
+      }
+   ]
+}
Index: test/ScopInfo/allow-all-parameters-dereferencable.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/allow-all-parameters-dereferencable.ll
@@ -0,0 +1,98 @@
+; RUN: opt %loadPolly -analyze -polly-invariant-load-hoisting \
+; RUN: -polly-allow-dereference-of-all-function-parameters \
+; RUN: -polly-scops < %s | FileCheck %s --check-prefix=SCOP
+
+; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \
+; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE-RTC
+
+
+; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \
+; RUN: -polly-allow-dereference-of-all-function-parameters \
+; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE
+
+; SCOP:      Function: hoge
+; SCOP-NEXT: Region: %bb15---%bb37
+; SCOP-NEXT: Max Loop Depth: 2
+; SCOP-NEXT: Invariant Accesses: {
+; SCOP-NEXT:   ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; SCOP-NEXT:     [tmp, tmp17, tmp28] -> { Stmt_bb29[i0] -> MemRef_arg1[0] };
+; SCOP-NEXT:   Execution Context: [tmp, tmp17, tmp28] -> { : }
+; SCOP-NEXT:   ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; SCOP-NEXT:     [tmp, tmp17, tmp28] -> { Stmt_bb27[] -> MemRef_arg[0] };
+; SCOP-NEXT:   Execution Context: [tmp, tmp17, tmp28] -> { : }
+; SCOP-NEXT: }
+
+; Check that without the option `-polly-allow-dereference-of-all-function-parameters`
+; we do generate the runtime check.
+; CODE-RTC: polly.preload.cond: ; preds = %polly.preload.begin
+; CODE-RTC-NEXT: br i1 %{{[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge
+
+; Check that we don't generate a runtime check because we treat all
+; parameters as dereferenceable.
+; CODE-NOT: polly.preload.cond: ; preds = %polly.preload.begin
+; CODE-NOT: br i1 %{{r1:[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge
+
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = external global i32
+
+; Function Attrs: nounwind uwtable
+define void @hoge(i32* noalias %arg, i32* noalias %arg1, [0 x double]* noalias %arg2, float* %A) #0 {
+bb:
+  %tmp = load i32, i32* @global, align 4
+  %tmp3 = icmp sgt i32 %tmp, 1
+  br label %bb14
+
+bb14:                                             ; preds = %bb
+  br label %bb15
+
+bb15:                                             ; preds = %bb14
+  br i1 %tmp3, label %bb16, label %bb27
+
+bb16:                                             ; preds = %bb15
+  %tmp17 = load i32, i32* %arg1, align 4
+  br label %bb18
+
+bb18:                                             ; preds = %bb18, %bb16
+  %tmp19 = phi i32 [ %tmp25, %bb18 ], [ 1, %bb16 ]
+  %tmp20 = sext i32 %tmp19 to i64
+  %tmp21 = add nsw i64 %tmp20, -1
+  %tmp22 = getelementptr [0 x double], [0 x double]* %arg2, i64 0, i64 %tmp21
+  %tmp23 = bitcast double* %tmp22 to i64*
+  store i64 undef, i64* %tmp23, align 8
+  %tmp24 = icmp eq i32 %tmp19, %tmp17
+  %tmp25 = add i32 %tmp19, 1
+  br i1 %tmp24, label %bb26, label %bb18
+
+bb26:                                             ; preds = %bb18
+  br label %bb27
+
+bb27:                                             ; preds = %bb26, %bb15
+  %tmp28 = load i32, i32* %arg, align 4
+  store float 42.0, float* %A
+  br label %bb29
+
+bb29:                                             ; preds = %bb35, %bb27
+  %tmp30 = load i32, i32* %arg1, align 4
+  store float 42.0, float* %A
+  br label %bb31
+
+bb31:                                             ; preds = %bb31, %bb29
+  %tmp32 = phi i32 [ undef, %bb31 ], [ 1, %bb29 ]
+  store float 42.0, float* %A
+  %tmp33 = icmp eq i32 %tmp32, %tmp30
+  br i1 %tmp33, label %bb34, label %bb31
+
+bb34:                                             ; preds = %bb31
+  br label %bb35
+
+bb35:                                             ; preds = %bb34
+  %tmp36 = icmp eq i32 0, %tmp28
+  br i1 %tmp36, label %bb37, label %bb29
+
+bb37:                                             ; preds = %bb35
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
Index: test/ScopInliner/scop-from-entry-successor.ll
===================================================================
--- /dev/null
+++ test/ScopInliner/scop-from-entry-successor.ll
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -polly-detect-full-functions -polly-scop-inliner \
+; RUN: -polly-scops -analyze < %s | FileCheck %s
+
+; Check that we get the 2 nested loops by inlining `to_be_inlined` into
+; `inline_site`, when `to_be_inlined` has a Scop that stretches from the
+; *successor* of the entry block to the exit node.
+
+; CHECK: Max Loop Depth: 2
+
+; static const int N = 1000;
+;
+; void to_be_inlined(int A[]) {
+;   int B; // kept on purpose (we do not run dead code elimination) so that
+;          // the function has an alloca.
+;   for(int i = 0; i < N; i++)
+;     A[i] *= 10;
+; }
+;
+; void inline_site(int A[]) {
+;   for(int i = 0; i < N; i++)
+;     to_be_inlined(A);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+
+define void @to_be_inlined(i32* %A) {
+entry:
+  %alloc_to_block_modelling = alloca i32
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %for.body
+  %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv1
+  %tmp = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %tmp, 10
+  store i32 %mul, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @inline_site(i32* %A) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %for.body
+  %i.01 = phi i32 [ 0, %entry.split ], [ %inc, %for.body ]
+  tail call void @to_be_inlined(i32* %A)
+  %inc = add nuw nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -941,6 +941,10 @@
 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;

+typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
+                                                unsigned int);
+static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
+
 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
     unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
@@ -1081,6 +1085,9 @@
   CuMemAllocFcnPtr =
       (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");

+  CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
+      HandleCuda, "cuMemAllocManaged");
+
   CuMemFreeFcnPtr =
       (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");

@@ -1445,7 +1452,7 @@
   // If not, we pass it along to the underlying allocator.
   // This is a hack, and can be removed if the underlying issue is fixed.
   if (isManagedPtr(mem)) {
-    if (cudaFree(mem) != cudaSuccess) {
+    if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
      fprintf(stderr, "cudaFree failed.\n");
      exit(-1);
    }
@@ -1465,15 +1472,18 @@
    fprintf(stderr, "cudaMallocManaged called with size 0. "
" "Promoting to size 1"); size = max(size, 1); - polly_initContextCUDA(); - dump_function(); - void *a; - if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { + PollyGPUContext *_ = polly_initContextCUDA(); + assert(_ && "polly_initContextCUDA failed"); + + void *newMemPtr; + const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size, + CU_MEM_ATTACH_GLOBAL); + if (Res != CUDA_SUCCESS) { fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); exit(-1); } - addManagedPtr(a); - return a; + addManagedPtr(newMemPtr); + return newMemPtr; } static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { Index: unittests/Isl/IslTest.cpp =================================================================== --- unittests/Isl/IslTest.cpp +++ unittests/Isl/IslTest.cpp @@ -783,6 +783,63 @@ computeArrayUnused(UMAP("{ Write[] -> [0] }"), UMAP("{ Write[] -> Elt[] }"), UMAP("{}"), ReadEltInSameInst, false, true)); + + // Two writes + EXPECT_EQ( + UMAP("{ Elt[] -> [i] : i <= 10 }"), + computeArrayUnused(UMAP("{ WriteA[] -> [0]; WriteB[] -> [10] }"), + UMAP("{ WriteA[] -> Elt[]; WriteB[] -> Elt[] }"), + UMAP("{}"), ReadEltInSameInst, false, true)); + + // Two unused zones + // read,write,read,write + EXPECT_EQ( + UMAP("{ Elt[] -> [i] : 0 < i <= 10; Elt[] -> [i] : 20 < i <= 30 }"), + computeArrayUnused(UMAP("{ ReadA[] -> [0]; WriteA[] -> [10]; ReadB[] " + "-> [20]; WriteB[] -> [30] }"), + UMAP("{ WriteA[] -> Elt[]; WriteB[] -> Elt[] }"), + UMAP("{ ReadA[] -> Elt[]; ReadB[] -> Elt[] }"), + ReadEltInSameInst, false, true)); + + // write, write + EXPECT_EQ( + UMAP("{ Elt[] -> [i] : i <= 10 }"), + computeArrayUnused( + UMAP("{ WriteA[] -> [0]; WriteB[] -> [10]; Read[] -> [20] }"), + UMAP("{ WriteA[] -> Elt[]; WriteB[] -> Elt[] }"), + UMAP("{ Read[] -> Elt[] }"), ReadEltInSameInst, false, true)); + + // write, read + EXPECT_EQ(UMAP("{ Elt[] -> [i] : i <= 0 }"), + computeArrayUnused(UMAP("{ Write[] -> [0]; Read[] -> [10] }"), + UMAP("{ Write[] -> Elt[] }"), + UMAP("{ Read[] -> Elt[] }"), ReadEltInSameInst, + false, true)); + + // read, write, read + EXPECT_EQ(UMAP("{ Elt[] -> [i] : 0 < i <= 10 }"), + computeArrayUnused( + UMAP("{ ReadA[] -> [0]; Write[] -> [10]; ReadB[] -> [20] }"), + UMAP("{ Write[] -> Elt[] }"), + UMAP("{ ReadA[] -> Elt[]; ReadB[] -> Elt[] }"), + ReadEltInSameInst, false, true)); + + // read, write, write + EXPECT_EQ( + UMAP("{ Elt[] -> [i] : 0 < i <= 20 }"), + computeArrayUnused( + UMAP("{ Read[] -> [0]; WriteA[] -> [10]; WriteB[] -> [20] }"), + UMAP("{ WriteA[] -> Elt[]; WriteB[] -> Elt[] }"), + UMAP("{ Read[] -> Elt[] }"), ReadEltInSameInst, false, true)); + + // read, write, write, read + EXPECT_EQ( + UMAP("{ Elt[] -> [i] : 0 < i <= 20 }"), + computeArrayUnused(UMAP("{ ReadA[] -> [0]; WriteA[] -> [10]; WriteB[] " + "-> [20]; ReadB[] -> [30] }"), + UMAP("{ WriteA[] -> Elt[]; WriteB[] -> Elt[] }"), + UMAP("{ ReadA[] -> Elt[]; ReadB[] -> Elt[] }"), + ReadEltInSameInst, false, true)); } // Read and write in same statement