Index: include/llvm/Analysis/LoopCostAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Analysis/LoopCostAnalysis.h @@ -0,0 +1,159 @@ +//===----- LoopCostAnalysis.h - Analyse a loop for its cost ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of routines that does the loop cost analysis. +// +// Prerequisite Reading: +// Compiler Optimizations for Improving Data Locality [Carr-McKinley-Tseng] + +// High level details: +// Look through the memory accesses and create groups of references such that +// two references fall into different groups if they are accessed in different +// cache lines. Each group is then analysed with respect to innermost loops +// considering cache lines. + +// Penalty for the reference is: +// a. 1 if the reference is invariant with the innermost loop, +// b. TripCount for non-unit stride access, +// c. TripCount/CacheLineSize for a unit-stride access. + +// Loop cost is the sum total of the reference penalties times the product of +// the loop bounds of the outer loops. + +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { + +// This data is mainly used for cache resue (spatial/temporal) aware +// calculations. +// TODO: This should probably go to a separate file with separate +// implemenation if the size grows. +class CacheData { + unsigned LineSize; // The number of words in a cache line. + unsigned CacheSize; // The size of cache. + enum CacheWay { + DIRECT, + WAY2, + WAY4, + WAY8, + FULL + } Associativity; + + // TODO: Need to add other cache details as required. + +public: + CacheData() {} + + CacheData(unsigned _LS, unsigned _CS, CacheData::CacheWay _Assoc) { + LineSize = _LS; + CacheSize = _CS; + Associativity = _Assoc; + } + + // Get/set member functions. + void setLineSize(unsigned Size) { LineSize = Size; } + unsigned getLineSize() { return LineSize; } + + void setCacheSize(unsigned Size) { CacheSize = Size; } + unsigned getCacheSize() { return CacheSize; } + + void setAssociativity(CacheData::CacheWay Assoc) { Associativity = Assoc; } + CacheData::CacheWay getAssociativity() { return Associativity; } + + void initCacheData() { + // TODO: Get the cache data from the target architecture. + // Set default values for generic data layout. + LineSize = 4; // Statically setting for now. + } +}; + +// TODO: Add a class RegData that will be mainly used for reg reuse aware +// calculations. + +// Ordered list of loops from outer loop to the innermost loop that form the +// loop nest. +typedef SmallVector LoopNest_t; + +// This is a utility class that can be used to calculate cache aware loop costs +// for a perfectly nested loop nest. +class LoopCost { +public: + enum Order { + COLUMNMAJOR = 0, + ROWMAJOR + } AccessOrder; + +private: + CacheData Cache; + ScalarEvolution *SCEV; + + /// List of loops in loopnest with their tripcounts. + typedef std::pair loopTripcount; + SmallVector LoopTripCounts; + void setTripCounts(LoopNest_t LN); + + /// The reference groups - directly noting the GEPs instead of load/stores. + SmallVector ReferenceGroups; + void CreateReferenceGroups(BasicBlock *bb); + + /// The loops and their calculated loop costs. + std::map LoopCosts; + + bool ASTMatch(Value *Operand, PHINode *P); + +public: + LoopCost(ScalarEvolution *_SCEV) + : SCEV(_SCEV) { + Cache.initCacheData(); + AccessOrder = ROWMAJOR; + } + + // Given a perfect nest, calculate loop costs of the loops in the nest. + void calculateLoopCosts(LoopNest_t LN); + + double getLoopCostOf(Loop *L); + + // Print routines. + void printLoopCosts(); + void printTripCounts(); + void printReferenceGroups(); +}; + + +class LoopCostAnalysis : public FunctionPass { + LoopCost *LC; + +public: + static char ID; + + LoopCostAnalysis() : FunctionPass(ID) { + initializeLoopCostAnalysisPass(*PassRegistry::getPassRegistry()); + } + + const LoopCost &getLoopCosts() const { return *LC; } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + } +}; +} Index: include/llvm/Analysis/Passes.h =================================================================== --- include/llvm/Analysis/Passes.h +++ include/llvm/Analysis/Passes.h @@ -74,6 +74,13 @@ //===--------------------------------------------------------------------===// // + // createLoopCostAnalysisPass - This pass assigns a numerical cost to each + // loop in the loop nest considering cache and register data. + // + FunctionPass *createLoopCostAnalysisPass(); + + //===--------------------------------------------------------------------===// + // // createRegionInfoPass - This pass finds all single entry single exit regions // in a function and builds the region hierarchy. // Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -122,6 +122,7 @@ void initializeEarlyIfConverterPass(PassRegistry&); void initializeEdgeBundlesPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); +void initializeLoopCostAnalysisPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry &); void initializeGCOVProfilerLegacyPassPass(PassRegistry&); void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -109,6 +109,7 @@ (void) llvm::createLCSSAPass(); (void) llvm::createLICMPass(); (void) llvm::createLazyValueInfoPass(); + (void) llvm::createLoopCostAnalysisPass(); (void) llvm::createLoopExtractorPass(); (void) llvm::createLoopInterchangePass(); (void) llvm::createLoopSimplifyPass(); Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -45,6 +45,7 @@ initializeDomOnlyViewerPass(Registry); initializePostDomViewerPass(Registry); initializeDomOnlyPrinterPass(Registry); + initializeLoopCostAnalysisPass(Registry); initializePostDomPrinterPass(Registry); initializePostDomOnlyViewerPass(Registry); initializePostDomOnlyPrinterPass(Registry); Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -40,6 +40,7 @@ Lint.cpp Loads.cpp LoopAccessAnalysis.cpp + LoopCostAnalysis.cpp LoopUnrollAnalyzer.cpp LoopInfo.cpp LoopPass.cpp Index: lib/Analysis/LoopCostAnalysis.cpp =================================================================== --- /dev/null +++ lib/Analysis/LoopCostAnalysis.cpp @@ -0,0 +1,382 @@ +//===----- LoopCostAnalysis.cpp - Analyse a loop for its cost -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of routines that does the loop cost analysis. +// TODO: Extend the Loop Cost calculation to imperfect nests and more than one +// basicblock in the innermost loop. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopCostAnalysis.h" + +using namespace llvm; + +#define DEBUG_TYPE "loop-cost" + +/* - Build/test perfect loop nests - should probably be moved to Loop Utils - */ +// TODO: Rotated loops forming perfect nests cannot be relied on and hence not +// considered for now. +bool IsRotatedLoop(Loop *L) { + return (L->getHeader() != L->getExitingBlock()); // Probably rotated loop. +} + +// Return true if all blocks of @L are perfectly nested under @L's subloop. @L's +// header and latch are exempted though. +bool BlocksPerfectlyNestedUnder(Loop *L) { + assert ((L->getSubLoops().size() == 1) && + "Expected loop containing a single subloop!"); + Loop *Subloop = *L->begin(); + for (auto bb = L->block_begin(), bbe = L->block_end(); bb != bbe; ++bb) { + if (*bb == L->getHeader() || *bb == L->getLoopLatch()) + continue; + if (!Subloop->contains(*bb)) { + // Ignore empty blocks. + auto bbBr = dyn_cast((*bb)->getTerminator()); + if ((*bb)->size() == 1 && (bbBr && bbBr->isUnconditional())) + continue; + return false; + } + } + return true; +} + +bool hasSimpleHeaderLatch(Loop *L) { + // TODO: Unrotated loops form perfect nests only if loop's header and latch + // contain only loop control updates and exit check. Any other instruction is + // deemed to violate perfect nesting. + return true; +} + +bool +PopulatePerfectLoopNestsUnder(Loop *L, + SmallVectorImpl &PerfectLoopNests) { + // Base case: the innermost loop. + if (L->begin() == L->end()) { + if (!IsRotatedLoop(L) && hasSimpleHeaderLatch(L)) { + LoopNest_t LN; + LN.push_back(L); + PerfectLoopNests.push_back(LN); + return true; + } + return false; + } + + // Iterate over subloops. + bool PerfectSubnest = true; + for (Loop::iterator i = L->begin(), e = L->end(); i != e; i++) + PerfectSubnest &= PopulatePerfectLoopNestsUnder(*i, PerfectLoopNests); + + // If perfect and single subnest. + if (PerfectSubnest && (L->getSubLoops().size() == 1) && !IsRotatedLoop(L) && + hasSimpleHeaderLatch(L) && BlocksPerfectlyNestedUnder(L)) { + // Add this loop to perfect nest. + LoopNest_t *LN = &PerfectLoopNests[PerfectLoopNests.size() - 1]; + LN->insert(LN->begin(), L); + return true; + } + return false; +} + +// Check if the loops in @LN form a perfect nest. Loops in @LN should be listed +// from outermost loop to innermost loop of the loop nest. +bool IsPerfectNest(LoopNest_t LN) { + auto l = LN.rbegin(); // Start with innermost loop. + Loop *Innermost = *l; + if ((Innermost->begin() != Innermost->end()) || IsRotatedLoop(Innermost) || + !hasSimpleHeaderLatch(Innermost)) + return false; + + Loop *Subloop = Innermost; + ++l; + for (auto le = LN.rend(); l != le; ++l) { + Loop *L = *l; + if ((L->getSubLoops().size() != 1) || (*L->begin() != Subloop) || + IsRotatedLoop(L) || !hasSimpleHeaderLatch(L) || + !BlocksPerfectlyNestedUnder(L)) + return false; + Subloop = L; + } + return true; +} + +// LoopCost implementation. +// Print the loop costs of all the loops in the loopnest. +void LoopCost::printLoopCosts() { + DEBUG( + dbgs() << "Printing Loop Costs: "; + if (LoopCosts.empty()) + dbgs() << "(empty)"; + dbgs() << "\n"; + for (auto lc : LoopCosts) { + dbgs() << "Loop: " << lc.first->getHeader()->getName(); + dbgs() << "\tCosts: " << lc.second << "\n"; + } + ); +} + +// Print the reference groups. +void LoopCost::printReferenceGroups() { + DEBUG( + dbgs() << "Printing Reference Groups (GEPs): "; + if (ReferenceGroups.begin() == ReferenceGroups.end()) + dbgs() << "(empty)"; + dbgs() << "\n"; + for (auto ri : ReferenceGroups) + dbgs() << "Ref group: " << *ri << "\n"; + ); +} + +// Print the loop trip counts of each of the loops in the loopnest. +void LoopCost::printTripCounts() { + DEBUG( + dbgs() << "Printing Trip Counts: "; + if (LoopTripCounts.empty()) + dbgs() << "(empty)"; + dbgs() << "\n"; + for (auto ltc : LoopTripCounts) { + dbgs() << "Loop: " << ltc.first->getHeader()->getName(); + dbgs() << "\tTripCount: " << ltc.second << "\n"; + } + ); +} + +// Return loop cost of @L. Return -1.0 if loop is not found in map. +double LoopCost::getLoopCostOf(Loop *L) { + auto LC = LoopCosts.find(L); + if (LC == LoopCosts.end()) + return -1.0; + return LC->second; +} + +// Set the trip counts for each of the loops in the loopnest. If trip count for +// a loop is not found, a normalized value based on the surrounding loops is +// set or statically set to STATIC_TRIP_COUNT. +void LoopCost::setTripCounts(LoopNest_t LN) { +#define STATIC_TRIP_COUNT 1000 + + for (auto L : LN) { + unsigned TripCount = SCEV->getSmallConstantTripCount(L, L->getExitingBlock()); + LoopTripCounts.push_back(std::make_pair(L, (TripCount))); + } + + // Normalize trip counts: reset trip counts if 0. + // TODO: Can do better? + for (auto li = LoopTripCounts.begin(), le = LoopTripCounts.end(); + li != le; li++) { + unsigned TC = (*li).second; + if (TC) + continue; + // TC is 0, reset to average of the surrounding loop's tripcount. + auto prev = li, next = li; + if (li == LoopTripCounts.begin()) { + ++next; + (*li).second = (*next).second; + } else if (li == (LoopTripCounts.end() - 1)) { + --prev; + (*li).second = (*prev).second; + } else { + --prev; ++next; + (*li).second = ((*prev).second + (*next).second) / 2; + } + if ((*li).second == 0) // Still 0? Statically set! + (*li).second = STATIC_TRIP_COUNT; + } +} + +// This routine creates different Reference (GEP) Groups based on GEP's access +// w.r.t. CacheLine. A GEP access is added to an existing group if it's access +// lies in the same cache line; otherwise this GEP will form a new RefGroup. +// Groups can be made based on the following: +// a. A GEP having different number of operands will form a different group. +// b. Only the last operand of GEP is analysed as that is what matters with the +// cache line. +// c. Two GEPs with their last operands - i and i + 4 falls into same RefGroup +// if SCEV(i) - SCEV(i + 4) is a constant that is less than the cache line. +// TODO: Add alignment info. +void LoopCost::CreateReferenceGroups(BasicBlock *bb) { + assert(bb); + for (auto &i : *bb) { + if (auto *GEP = dyn_cast(&i)) { + bool UniqueGEP = true; + auto NumOps = GEP->getNumOperands(); + auto LastOp = GEP->getOperand(NumOps - 1); + if (SCEV->isSCEVable(LastOp->getType())) { + for (auto RG : ReferenceGroups) { + auto RGNumOps = RG->getNumOperands(); + if (RGNumOps != NumOps) + continue; + + // All n - 1 GEP operands must be equal. + bool EqualOps = true; + for (unsigned i = 0, e = NumOps - 1; (EqualOps && i != e); ++i) + if (GEP->getOperand(i) != RG->getOperand(i)) + EqualOps = false; + if (!EqualOps) + continue; + + auto RGLastOp = RG->getOperand(RGNumOps - 1); + if (!SCEV->isSCEVable(RGLastOp->getType())) + continue; + + // Check if (LastOp - RGLastOp) lies within cache line. + auto GEPSCEV = SCEV->getSCEV(LastOp); + auto RGSCEV = SCEV->getSCEV(RGLastOp); + auto DiffSCEV = + dyn_cast(SCEV->getMinusSCEV(GEPSCEV, RGSCEV)); + if (DiffSCEV->getValue()->getSExtValue() < Cache.getLineSize()) { + UniqueGEP = false; + break; + } + } + } + if (UniqueGEP) + ReferenceGroups.push_back(GEP); + } + } +} + +// Return true if the phinode is used in the computation of the operand. +// TODO: Use SCEVTraversal or some similar thing. +bool LoopCost::ASTMatch(Value *Operand, PHINode *P) { + if (Operand == P) + return true; + + auto I = dyn_cast(Operand); + if (!I || isa(I)) + return false; + + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) + if (ASTMatch(I->getOperand(i), P)) + return true; + return false; +} + +BasicBlock *getInnerSingleBB(Loop *L) { + BasicBlock *BB = nullptr; + for (auto bb = L->block_begin(), bbe = L->block_end(); bb != bbe; ++bb) { + if (*bb == L->getHeader() || *bb == L->getLoopLatch()) + continue; + + // Ignore empty blocks. + auto bbBr = dyn_cast((*bb)->getTerminator()); + if ((*bb)->size() == 1 && (bbBr && bbBr->isUnconditional())) + continue; + + if (BB) // A BB was already found. + return nullptr; // There are multiple BBs. + + BB = *bb; + } + return BB; +} + +void LoopCost::calculateLoopCosts(LoopNest_t LN) { + // Initialize cost of all loops in nest to -1. + for (auto L : LN) + LoopCosts[L] = -1.0; + + if (!IsPerfectNest(LN)) + return; + + // For now, restricting to a single bb in innermost loop. This will be + // relaxed soon. + BasicBlock *innermostBB = getInnerSingleBB(*LN.rbegin()); + if (!innermostBB) + return; + + CreateReferenceGroups(innermostBB); + setTripCounts(LN); + + // Calculate loop costs for each of the loop in the loopnest assuming loop to + // be the innermost loop. + for (auto L : LN) { + double ThisLoopCost = 0.0; + + PHINode *P = L->getCanonicalInductionVariable(); + if (!P) { + DEBUG(dbgs() << "Could not find induction variable\n"); + continue; + } + DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() + << " phi: " << *P << "\n"); + + // Calculate penalties from other loops. + double ThisLoopPenalty = 1.0; + double OtherLoopPenalties = 1.0; + for (auto ltc : LoopTripCounts) { + if (ltc.first == L) + ThisLoopPenalty = ltc.second; + else + OtherLoopPenalties = OtherLoopPenalties * ltc.second; + } + assert((ThisLoopPenalty > 0 && OtherLoopPenalties > 0) + && "Incorrect loop penalty"); + DEBUG(dbgs() << "ThisLoopPenalty: " << ThisLoopPenalty << "\n"); + DEBUG(dbgs() << "OtherLoopPenalties: " << OtherLoopPenalties << "\n"); + + for (auto GEP : ReferenceGroups) { + // Check stride of this GEP access. The position of use of 'P' is + // checked in the operands of GEP. TODO: If the operands of GEP do not + // use 'P' directly, the AST has to be traversed starting from each + // operand of GEP in order to find the use of 'P'. + DEBUG(dbgs() << "GEP: " << *GEP << "\n"); + double ThisRefPenalty = 1.0; // Is 1 for an invariant reference. + for (unsigned ni = 1, ne = GEP->getNumIndices(); ni <= ne; ni++) { + if (ASTMatch(GEP->getOperand(ni), P)) { + // Check stride access. + if ((AccessOrder == ROWMAJOR && ni == ne) + || (AccessOrder == COLUMNMAJOR && ni == 2)) { // ni == 1 is primary offset. + // Contiguous locations. + ThisRefPenalty = ThisLoopPenalty / Cache.getLineSize(); + } else { + // Non-contiguous locations. + ThisRefPenalty = ThisLoopPenalty; + } + DEBUG(dbgs() << "ThisRefPenalty: " << ThisRefPenalty << "\n"); + } + } + ThisLoopCost = ThisLoopCost + ThisRefPenalty * OtherLoopPenalties; + DEBUG(dbgs() << "Accum cost of this loop: " << ThisLoopCost << "\n\n"); + } + LoopCosts[L] = ThisLoopCost; + } +} + +bool LoopCostAnalysis::runOnFunction(Function &F) { + auto LI = &getAnalysis().getLoopInfo(); + auto SCEV = &getAnalysis().getSE(); + LC = new LoopCost(SCEV); + + DEBUG(dbgs() << "Calculate LoopCosts in Function: " << F.getName() << "\n"); + SmallVector PerfectLoopNests; + // Build perfect loop nests. + for (auto L = LI->begin(), Le = LI->end(); L != Le; ++L) + PopulatePerfectLoopNestsUnder(*L, PerfectLoopNests); + + for (auto PN : PerfectLoopNests) + LC->calculateLoopCosts(PN); + + LC->printLoopCosts(); + LC->printTripCounts(); + return false; +} + +char LoopCostAnalysis::ID = 0; + +auto PassDesc = "Experimental, Cache aware Loop Cost Analysis"; +INITIALIZE_PASS_BEGIN(LoopCostAnalysis, "loop-cost", PassDesc, false, true) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopCostAnalysis, "loop-cost", PassDesc, false, true) + +namespace llvm { +FunctionPass *createLoopCostAnalysisPass() { + return new LoopCostAnalysis(); +} +} Index: test/Analysis/CostModel/X86/matmul-perfect-loopCost.ll =================================================================== --- /dev/null +++ test/Analysis/CostModel/X86/matmul-perfect-loopCost.ll @@ -0,0 +1,102 @@ +; RUN: opt -loop-cost -debug %s -o /dev/null 2>&1 | FileCheck %s +; Test Loop Cost Analysis of perfectly nested matrix multiplication code having +; global arrays. +; Generated from matmul-perfect.c: +; #include +; #define SIZE 5000 +; int a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE]; +; void matmul() { +; int i, j, k; +; for (i = 0; i < SIZE; i++) { +; for (j = 0; j < SIZE; j++) { +; for (k = 0; k < SIZE; k++) { +; c[i][j] = c[i][j] + a[i][k] * b[k][j]; +; } +; } +; } +; } +; with clang -emit-llvm matmul-perfect.c -S -o matmul-perfect.clang.ll +; then opt -mem2reg -loop-simplify -instcombine -instnamer -indvars -S -o matmul-perfect.opt.ll + +; CHECK: Loop: for.cond4 Costs: 1.563688e+11 +; CHECK-NEXT: Loop: for.cond1 Costs: 6.256252e+10 +; CHECK-NEXT: Loop: for.cond Costs: 2.501750e+11 +; CHECK: Loop: for.cond TripCount: 5001 +; CHECK-NEXT: Loop: for.cond1 TripCount: 5001 +; CHECK-NEXT: Loop: for.cond4 TripCount: 5001 + +; ModuleID = 'matmul-perfect.clang.bc' +source_filename = "matmul-perfect.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@c = common global [5000 x [5000 x i32]] zeroinitializer, align 16 +@a = common global [5000 x [5000 x i32]] zeroinitializer, align 16 +@b = common global [5000 x [5000 x i32]] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @matmul() #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %indvars.iv6 = phi i64 [ %indvars.iv.next7, %for.inc24 ], [ 0, %entry ] + %exitcond8 = icmp ne i64 %indvars.iv6, 5000 + br i1 %exitcond8, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc21, %for.body + %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc21 ], [ 0, %for.body ] + %exitcond5 = icmp ne i64 %indvars.iv3, 5000 + br i1 %exitcond5, label %for.body3, label %for.end23 + +for.body3: ; preds = %for.cond1 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body3 ] + %exitcond = icmp ne i64 %indvars.iv, 5000 + br i1 %exitcond, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %arrayidx8 = getelementptr inbounds [5000 x [5000 x i32]], [5000 x [5000 x i32]]* @c, i64 0, i64 %indvars.iv6, i64 %indvars.iv3 + %tmp = load i32, i32* %arrayidx8, align 4 + %arrayidx12 = getelementptr inbounds [5000 x [5000 x i32]], [5000 x [5000 x i32]]* @a, i64 0, i64 %indvars.iv6, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx12, align 4 + %arrayidx16 = getelementptr inbounds [5000 x [5000 x i32]], [5000 x [5000 x i32]]* @b, i64 0, i64 %indvars.iv, i64 %indvars.iv3 + %tmp2 = load i32, i32* %arrayidx16, align 4 + %mul = mul nsw i32 %tmp1, %tmp2 + %add = add nsw i32 %tmp, %mul + %arrayidx20 = getelementptr inbounds [5000 x [5000 x i32]], [5000 x [5000 x i32]]* @c, i64 0, i64 %indvars.iv6, i64 %indvars.iv3 + store i32 %add, i32* %arrayidx20, align 4 + br label %for.inc + +for.inc: ; preds = %for.body6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc21 + +for.inc21: ; preds = %for.end + %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1 + br label %for.cond1 + +for.end23: ; preds = %for.cond1 + br label %for.inc24 + +for.inc24: ; preds = %for.end23 + %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1 + br label %for.cond + +for.end26: ; preds = %for.cond + ret void +} + +attributes #0 = { nounwind uwtable } + +!llvm.ident = !{!0} + +!0 = !{!""}