Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -411,6 +411,10 @@ return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; } + /// \return The size in bits of the widest memory access in the expression + /// tree ending at \p V. This method is used to calculate vector factors. + unsigned getMemAccessWidth(Value *V); + private: struct TreeEntry; @@ -3139,10 +3143,73 @@ BS->ScheduleStart = nullptr; } +unsigned BoUpSLP::getMemAccessWidth(Value *V) { + auto &DL = F->getParent()->getDataLayout(); + + // If V is a store, just return the width of the stored value without + // traversing the expression tree. This is the common case. + if (auto *Store = dyn_cast(V)) + return DL.getTypeSizeInBits(Store->getValueOperand()->getType()); + + // If V is not a store, we can traverse the expression tree to find loads + // that feed it. The type of the loaded value may indicate a more suitable + // width than V's type. We want to base the vector factor on the width of + // memory operations where possible. + SmallVector Worklist; + SmallPtrSet Visited; + if (auto *I = dyn_cast(V)) + Worklist.push_back(I); + + // Traverse the expression tree in bottom-up order looking for loads. If we + // encounter an instruciton we don't yet handle, we give up. + auto MaxWidth = 0u; + auto FoundUnknownInst = false; + while (!Worklist.empty() && !FoundUnknownInst) { + auto *I = Worklist.pop_back_val(); + Visited.insert(I); + + // We should only be looking at scalar instructions here. If the current + // instruction has a vector type, give up. + auto *Ty = I->getType(); + if (isa(Ty)) + FoundUnknownInst = true; + + // If the current instruction is a load, update MaxWidth to reflect the + // width of the loaded value. + else if (isa(I)) + MaxWidth = std::max(MaxWidth, (unsigned) DL.getTypeSizeInBits(Ty)); + + // Otherwise, we need to visit the operands of the instruction. We only + // handle the interesting cases from buildTree here. If an operand is an + // instruction we haven't yet visited, we add it to the worklist. + else if (isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || isa(I)) { + for (Use &U : I->operands()) + if (auto *J = dyn_cast(U.get())) + if (!Visited.count(J)) + Worklist.push_back(J); + } + + // If we don't yet handle the instruction, give up. + else + FoundUnknownInst = true; + } + + // If we didn't encounter a memory access in the expression tree, or if we + // gave up for some reason, just return the width of V. + if (!MaxWidth || FoundUnknownInst) + return DL.getTypeSizeInBits(V->getType()); + + // Otherwise, return the maximum width we found. + return MaxWidth; +} + /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { typedef SmallVector StoreList; typedef MapVector StoreListMap; + typedef SmallVector LoadList; + typedef MapVector LoadListMap; /// Pass identification, replacement for typeid static char ID; @@ -3173,6 +3240,7 @@ AC = &getAnalysis().getAssumptionCache(F); StoreRefs.clear(); + LoadRefs.clear(); bool Changed = false; // If the target claims to have no vector registers don't attempt @@ -3206,15 +3274,24 @@ // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { + // Vectorize trees that end at stores. - if (unsigned count = collectStores(BB, R)) { - (void)count; - DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); + auto NumStore = collectStores(BB, R); + if (NumStore > 0) { + DEBUG(dbgs() << "SLP: Found " << NumStore << " stores to vectorize.\n"); Changed |= vectorizeStoreChains(R); } // Vectorize trees that end at reductions. Changed |= vectorizeChainsInBlock(BB, R); + + // Vectorize trees that end at loads. This is intended to catch + // gather-like idioms ending at non-consecutive loads. + auto NumLoad = collectLoads(BB, R); + if (NumLoad > 0) { + DEBUG(dbgs() << "SLP: Found " << NumLoad << " loads to vectorize.\n"); + Changed |= vectorizeLoadChains(BB, R); + } } if (Changed) { @@ -3240,12 +3317,15 @@ private: - /// \brief Collect memory references and sort them according to their base - /// object. We sort the stores to their base objects to reduce the cost of the - /// quadratic search on the stores. TODO: We can further reduce this cost - /// if we flush the chain creation every time we run into a memory barrier. + /// \brief Collect stores and sort them according to their base object. We + /// sort the stores by their base objects to reduce the cost of the quadratic + /// search on the stores. TODO: We can further reduce this cost if we flush + /// the chain creation every time we run into a memory barrier. unsigned collectStores(BasicBlock *BB, BoUpSLP &R); + /// \brief Collect loads and sort them according to their base object. + unsigned collectLoads(BasicBlock *BB, BoUpSLP &R); + /// \brief Try to vectorize a chain that starts at two arithmetic instrs. bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); @@ -3263,6 +3343,9 @@ /// \brief Vectorize the stores that were collected in StoreRefs. bool vectorizeStoreChains(BoUpSLP &R); + /// \brief Vectorize the loads that were collected in LoadRefs. + bool vectorizeLoadChains(BasicBlock *BB, BoUpSLP &R); + /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); @@ -3274,6 +3357,7 @@ BoUpSLP &R); private: StoreListMap StoreRefs; + LoadListMap LoadRefs; unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. }; @@ -3294,9 +3378,7 @@ unsigned ChainLen = Chain.size(); DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); - Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); - auto &DL = cast(Chain[0])->getModule()->getDataLayout(); - unsigned Sz = DL.getTypeSizeInBits(StoreTy); + unsigned Sz = R.getMemAccessWidth(Chain[0]); unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) @@ -3407,6 +3489,33 @@ return Changed; } +unsigned SLPVectorizer::collectLoads(BasicBlock *BB, BoUpSLP &R) { + unsigned count = 0; + LoadRefs.clear(); + const DataLayout &DL = BB->getModule()->getDataLayout(); + for (Instruction &I : *BB) { + LoadInst *LI = dyn_cast(&I); + if (!LI) + continue; + + // Don't touch volatile loads. + if (!LI->isSimple()) + continue; + + // Check that the pointer points to scalars. + Type *Ty = LI->getType(); + if (!isValidElementType(Ty)) + continue; + + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(LI->getPointerOperand(), DL); + + // Save the load locations. + LoadRefs[Ptr].push_back(LI); + count++; + } + return count; +} unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; @@ -3457,12 +3566,10 @@ return false; unsigned Opcode0 = I0->getOpcode(); - const DataLayout &DL = I0->getModule()->getDataLayout(); - Type *Ty0 = I0->getType(); - unsigned Sz = DL.getTypeSizeInBits(Ty0); // FIXME: Register size should be a parameter to this function, so we can // try different vectorization factors. + unsigned Sz = R.getMemAccessWidth(I0); unsigned VF = MinVecRegSize / Sz; for (Value *V : VL) { @@ -4170,6 +4277,78 @@ return Changed; } +bool SLPVectorizer::vectorizeLoadChains(BasicBlock *BB, BoUpSLP &R) { + auto Changed = false; + auto &DL = BB->getModule()->getDataLayout(); + for (auto &Entry : LoadRefs) { + + // If the load list has fewer than two elements, there's nothing to do. + if (Entry.second.size() < 2) + continue; + + DEBUG(dbgs() << "SLP: Analyzing a load chain of length " + << Entry.second.size() << ".\n"); + + // If the load list accesses contiguous memory, it doesn't make sense to + // pursue it further in a bottom-up phase. It should have already been + // vectorized if profitable. The code below gives up if a single pair of + // consecutive accesses is found. + auto IsConsecutive = false; + for (unsigned I = 0; I < Entry.second.size() && !IsConsecutive; ++I) + for (unsigned J = 0; J < Entry.second.size() && !IsConsecutive; ++J) + if (I != J) + if (R.isConsecutiveAccess(Entry.second[I], Entry.second[J], DL)) + IsConsecutive = true; + if (IsConsecutive) + continue; + + // Try and vectorize trees ending at non-consecutive loads. We are currenly + // only interested in gather-like cases of the form: + // + // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... + // + // where the loads of "a", the loads of "b", and the subtractions can be + // performed in parallel. It's likely that detecting this pattern + // in a bottom-up phase will be simpler and less costly than building a + // full-blown top-down phase beginning at the consecutive loads. + SmallVector Bundle; + for (auto VH : Entry.second) { + + // The load may have been vectorized after we initially collected it. If + // so, the WeakVH will have nullified the value. + auto *Load = dyn_cast_or_null(VH); + if (!Load) + break; + + // We only handle loads in which the pointer operand is a getelementptr. + // If we encounter anything else, give up. + auto *GEP = dyn_cast(Load->getPointerOperand()); + if (!GEP) + break; + + // Furthermore, we only handle getelementptr instructions having a single + // non-constant index. Constant indices are not interesting, and we limit + // ourselves to getelementptr's with a single index to save compile time. + // If we encounter anything else, give up. + // + // TODO: Relax these restrictions if we find an interesting case that we + // miss and compile time is not an issue. + auto *Idx = GEP->idx_begin()->get(); + if (GEP->getNumIndices() > 1 || isa(Idx)) + break; + Bundle.push_back(Idx); + } + + // If the bundle has fewer than two elements, there's nothing to do. + if (Bundle.size() < 2) + continue; + + // Try to vectorize the bundle. + Changed |= tryToVectorizeList(Bundle, R); + } + return Changed; +} + bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. Index: test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -0,0 +1,262 @@ +; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; These tests check that we vectorize the index calculations in the +; gather-reduce pattern shown below. We check cases having i32 and i64 +; subtraction. +; +; int gather_reduce_8x16(short *a, short *b, short *g, int n) { +; int sum = 0; +; for (int i = 0; i < n ; ++i) { +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; } +; return sum; +; } + +; CHECK-LABEL: @gather_reduce_8x16_i32 +; +; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> +; CHECK: zext <8 x i16> [[L]] to <8 x i32> +; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32> +; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]] +; CHECK: sext i32 [[X]] to i64 +; +define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) { +entry: + %cmp.99 = icmp sgt i32 %n, 0 + br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ] + ret i32 %sum.0.lcssa + +for.body: + %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ] + %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ] + %b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1 + %0 = load i16, i16* %a.addr.0101, align 2 + %conv = zext i16 %0 to i32 + %incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1 + %1 = load i16, i16* %b.addr.0100, align 2 + %conv2 = zext i16 %1 to i32 + %sub = sub nsw i32 %conv, %conv2 + %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub + %2 = load i16, i16* %arrayidx, align 2 + %conv3 = zext i16 %2 to i32 + %add = add nsw i32 %conv3, %sum.0102 + %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2 + %3 = load i16, i16* %incdec.ptr, align 2 + %conv5 = zext i16 %3 to i32 + %incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2 + %4 = load i16, i16* %incdec.ptr1, align 2 + %conv7 = zext i16 %4 to i32 + %sub8 = sub nsw i32 %conv5, %conv7 + %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8 + %5 = load i16, i16* %arrayidx10, align 2 + %conv11 = zext i16 %5 to i32 + %add12 = add nsw i32 %add, %conv11 + %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3 + %6 = load i16, i16* %incdec.ptr4, align 2 + %conv14 = zext i16 %6 to i32 + %incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3 + %7 = load i16, i16* %incdec.ptr6, align 2 + %conv16 = zext i16 %7 to i32 + %sub17 = sub nsw i32 %conv14, %conv16 + %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17 + %8 = load i16, i16* %arrayidx19, align 2 + %conv20 = zext i16 %8 to i32 + %add21 = add nsw i32 %add12, %conv20 + %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4 + %9 = load i16, i16* %incdec.ptr13, align 2 + %conv23 = zext i16 %9 to i32 + %incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4 + %10 = load i16, i16* %incdec.ptr15, align 2 + %conv25 = zext i16 %10 to i32 + %sub26 = sub nsw i32 %conv23, %conv25 + %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26 + %11 = load i16, i16* %arrayidx28, align 2 + %conv29 = zext i16 %11 to i32 + %add30 = add nsw i32 %add21, %conv29 + %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5 + %12 = load i16, i16* %incdec.ptr22, align 2 + %conv32 = zext i16 %12 to i32 + %incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5 + %13 = load i16, i16* %incdec.ptr24, align 2 + %conv34 = zext i16 %13 to i32 + %sub35 = sub nsw i32 %conv32, %conv34 + %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35 + %14 = load i16, i16* %arrayidx37, align 2 + %conv38 = zext i16 %14 to i32 + %add39 = add nsw i32 %add30, %conv38 + %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6 + %15 = load i16, i16* %incdec.ptr31, align 2 + %conv41 = zext i16 %15 to i32 + %incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6 + %16 = load i16, i16* %incdec.ptr33, align 2 + %conv43 = zext i16 %16 to i32 + %sub44 = sub nsw i32 %conv41, %conv43 + %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44 + %17 = load i16, i16* %arrayidx46, align 2 + %conv47 = zext i16 %17 to i32 + %add48 = add nsw i32 %add39, %conv47 + %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7 + %18 = load i16, i16* %incdec.ptr40, align 2 + %conv50 = zext i16 %18 to i32 + %incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7 + %19 = load i16, i16* %incdec.ptr42, align 2 + %conv52 = zext i16 %19 to i32 + %sub53 = sub nsw i32 %conv50, %conv52 + %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53 + %20 = load i16, i16* %arrayidx55, align 2 + %conv56 = zext i16 %20 to i32 + %add57 = add nsw i32 %add48, %conv56 + %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8 + %21 = load i16, i16* %incdec.ptr49, align 2 + %conv59 = zext i16 %21 to i32 + %incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8 + %22 = load i16, i16* %incdec.ptr51, align 2 + %conv61 = zext i16 %22 to i32 + %sub62 = sub nsw i32 %conv59, %conv61 + %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62 + %23 = load i16, i16* %arrayidx64, align 2 + %conv65 = zext i16 %23 to i32 + %add66 = add nsw i32 %add57, %conv65 + %inc = add nuw nsw i32 %i.0103, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; CHECK-LABEL: @gather_reduce_8x16_i64 +; +; CHECK-NOT: load <8 x i16> +; +; FIXME: We are currently unable to vectorize the case with i64 subtraction +; because the zero extensions are too expensive. The solution here is to +; convert the i64 subtractions to i32 subtractions during vectorization. +; This would then match the case above. +; +define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) { +entry: + %cmp.99 = icmp sgt i32 %n, 0 + br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ] + ret i32 %sum.0.lcssa + +for.body: + %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ] + %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ] + %b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1 + %0 = load i16, i16* %a.addr.0101, align 2 + %conv = zext i16 %0 to i64 + %incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1 + %1 = load i16, i16* %b.addr.0100, align 2 + %conv2 = zext i16 %1 to i64 + %sub = sub nsw i64 %conv, %conv2 + %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub + %2 = load i16, i16* %arrayidx, align 2 + %conv3 = zext i16 %2 to i32 + %add = add nsw i32 %conv3, %sum.0102 + %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2 + %3 = load i16, i16* %incdec.ptr, align 2 + %conv5 = zext i16 %3 to i64 + %incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2 + %4 = load i16, i16* %incdec.ptr1, align 2 + %conv7 = zext i16 %4 to i64 + %sub8 = sub nsw i64 %conv5, %conv7 + %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8 + %5 = load i16, i16* %arrayidx10, align 2 + %conv11 = zext i16 %5 to i32 + %add12 = add nsw i32 %add, %conv11 + %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3 + %6 = load i16, i16* %incdec.ptr4, align 2 + %conv14 = zext i16 %6 to i64 + %incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3 + %7 = load i16, i16* %incdec.ptr6, align 2 + %conv16 = zext i16 %7 to i64 + %sub17 = sub nsw i64 %conv14, %conv16 + %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17 + %8 = load i16, i16* %arrayidx19, align 2 + %conv20 = zext i16 %8 to i32 + %add21 = add nsw i32 %add12, %conv20 + %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4 + %9 = load i16, i16* %incdec.ptr13, align 2 + %conv23 = zext i16 %9 to i64 + %incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4 + %10 = load i16, i16* %incdec.ptr15, align 2 + %conv25 = zext i16 %10 to i64 + %sub26 = sub nsw i64 %conv23, %conv25 + %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26 + %11 = load i16, i16* %arrayidx28, align 2 + %conv29 = zext i16 %11 to i32 + %add30 = add nsw i32 %add21, %conv29 + %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5 + %12 = load i16, i16* %incdec.ptr22, align 2 + %conv32 = zext i16 %12 to i64 + %incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5 + %13 = load i16, i16* %incdec.ptr24, align 2 + %conv34 = zext i16 %13 to i64 + %sub35 = sub nsw i64 %conv32, %conv34 + %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35 + %14 = load i16, i16* %arrayidx37, align 2 + %conv38 = zext i16 %14 to i32 + %add39 = add nsw i32 %add30, %conv38 + %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6 + %15 = load i16, i16* %incdec.ptr31, align 2 + %conv41 = zext i16 %15 to i64 + %incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6 + %16 = load i16, i16* %incdec.ptr33, align 2 + %conv43 = zext i16 %16 to i64 + %sub44 = sub nsw i64 %conv41, %conv43 + %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44 + %17 = load i16, i16* %arrayidx46, align 2 + %conv47 = zext i16 %17 to i32 + %add48 = add nsw i32 %add39, %conv47 + %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7 + %18 = load i16, i16* %incdec.ptr40, align 2 + %conv50 = zext i16 %18 to i64 + %incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7 + %19 = load i16, i16* %incdec.ptr42, align 2 + %conv52 = zext i16 %19 to i64 + %sub53 = sub nsw i64 %conv50, %conv52 + %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53 + %20 = load i16, i16* %arrayidx55, align 2 + %conv56 = zext i16 %20 to i32 + %add57 = add nsw i32 %add48, %conv56 + %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8 + %21 = load i16, i16* %incdec.ptr49, align 2 + %conv59 = zext i16 %21 to i64 + %incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8 + %22 = load i16, i16* %incdec.ptr51, align 2 + %conv61 = zext i16 %22 to i64 + %sub62 = sub nsw i64 %conv59, %conv61 + %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62 + %23 = load i16, i16* %arrayidx64, align 2 + %conv65 = zext i16 %23 to i32 + %add66 = add nsw i32 %add57, %conv65 + %inc = add nuw nsw i32 %i.0103, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +}