Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -411,6 +411,10 @@
     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
   }
 
+  /// \return The size in bits of the widest memory access in the expression
+  /// tree ending at \p V. This method is used to calculate vector factors.
+  unsigned getMemAccessWidth(Value *V);
+
 private:
   struct TreeEntry;
 
@@ -3139,10 +3143,73 @@
   BS->ScheduleStart = nullptr;
 }
 
+unsigned BoUpSLP::getMemAccessWidth(Value *V) {
+  auto &DL = F->getParent()->getDataLayout();
+
+  // If V is a store, just return the width of the stored value without
+  // traversing the expression tree. This is the common case.
+  if (auto *Store = dyn_cast<StoreInst>(V))
+    return DL.getTypeSizeInBits(Store->getValueOperand()->getType());
+
+  // If V is not a store, we can traverse the expression tree to find loads
+  // that feed it. The type of the loaded value may indicate a more suitable
+  // width than V's type. We want to base the vector factor on the width of
+  // memory operations where possible.
+  SmallVector<Instruction *, 16> Worklist;
+  SmallPtrSet<Instruction *, 16> Visited;
+  if (auto *I = dyn_cast<Instruction>(V))
+    Worklist.push_back(I);
+
+  // Traverse the expression tree in bottom-up order looking for loads. If we
+  // encounter an instruciton we don't yet handle, we give up.
+  auto MaxWidth = 0u;
+  auto FoundUnknownInst = false;
+  while (!Worklist.empty() && !FoundUnknownInst) {
+    auto *I = Worklist.pop_back_val();
+    Visited.insert(I);
+
+    // We should only be looking at scalar instructions here. If the current
+    // instruction has a vector type, give up.
+    auto *Ty = I->getType();
+    if (isa<VectorType>(Ty))
+      FoundUnknownInst = true;
+
+    // If the current instruction is a load, update MaxWidth to reflect the
+    // width of the loaded value.
+    else if (isa<LoadInst>(I))
+      MaxWidth = std::max(MaxWidth, (unsigned) DL.getTypeSizeInBits(Ty));
+
+    // Otherwise, we need to visit the operands of the instruction. We only
+    // handle the interesting cases from buildTree here. If an operand is an
+    // instruction we haven't yet visited, we add it to the worklist.
+    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+      for (Use &U : I->operands())
+       if (auto *J = dyn_cast<Instruction>(U.get()))
+         if (!Visited.count(J))
+           Worklist.push_back(J);
+    }
+
+    // If we don't yet handle the instruction, give up.
+    else
+      FoundUnknownInst = true;
+  }
+
+  // If we didn't encounter a memory access in the expression tree, or if we
+  // gave up for some reason, just return the width of V.
+  if (!MaxWidth || FoundUnknownInst)
+    return DL.getTypeSizeInBits(V->getType());
+
+  // Otherwise, return the maximum width we found.
+  return MaxWidth;
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
   typedef MapVector<Value *, StoreList> StoreListMap;
+  typedef SmallVector<WeakVH, 8> LoadList;
+  typedef MapVector<Value *, LoadList> LoadListMap;
 
   /// Pass identification, replacement for typeid
   static char ID;
@@ -3173,6 +3240,7 @@
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
     StoreRefs.clear();
+    LoadRefs.clear();
     bool Changed = false;
 
     // If the target claims to have no vector registers don't attempt
@@ -3206,15 +3274,24 @@
 
     // Scan the blocks in the function in post order.
     for (auto BB : post_order(&F.getEntryBlock())) {
+
       // Vectorize trees that end at stores.
-      if (unsigned count = collectStores(BB, R)) {
-        (void)count;
-        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
+      auto NumStore = collectStores(BB, R);
+      if (NumStore > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumStore << " stores to vectorize.\n");
         Changed |= vectorizeStoreChains(R);
       }
 
       // Vectorize trees that end at reductions.
       Changed |= vectorizeChainsInBlock(BB, R);
+
+      // Vectorize trees that end at loads. This is intended to catch
+      // gather-like idioms ending at non-consecutive loads.
+      auto NumLoad = collectLoads(BB, R);
+      if (NumLoad > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumLoad << " loads to vectorize.\n");
+        Changed |= vectorizeLoadChains(BB, R);
+      }
     }
 
     if (Changed) {
@@ -3240,12 +3317,15 @@
 
 private:
 
-  /// \brief Collect memory references and sort them according to their base
-  /// object. We sort the stores to their base objects to reduce the cost of the
-  /// quadratic search on the stores. TODO: We can further reduce this cost
-  /// if we flush the chain creation every time we run into a memory barrier.
+  /// \brief Collect stores and sort them according to their base object. We
+  /// sort the stores by their base objects to reduce the cost of the quadratic
+  /// search on the stores. TODO: We can further reduce this cost if we flush
+  /// the chain creation every time we run into a memory barrier.
   unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
 
+  /// \brief Collect loads and sort them according to their base object.
+  unsigned collectLoads(BasicBlock *BB, BoUpSLP &R);
+
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
 
@@ -3263,6 +3343,9 @@
   /// \brief Vectorize the stores that were collected in StoreRefs.
   bool vectorizeStoreChains(BoUpSLP &R);
 
+  /// \brief Vectorize the loads that were collected in LoadRefs.
+  bool vectorizeLoadChains(BasicBlock *BB, BoUpSLP &R);
+
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
@@ -3274,6 +3357,7 @@
                        BoUpSLP &R);
 private:
   StoreListMap StoreRefs;
+  LoadListMap LoadRefs;
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
 };
 
@@ -3294,9 +3378,7 @@
   unsigned ChainLen = Chain.size();
   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
         << "\n");
-  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
-  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
-  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+  unsigned Sz = R.getMemAccessWidth(Chain[0]);
   unsigned VF = VecRegSize / Sz;
 
   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3407,6 +3489,33 @@
   return Changed;
 }
 
+unsigned SLPVectorizer::collectLoads(BasicBlock *BB, BoUpSLP &R) {
+  unsigned count = 0;
+  LoadRefs.clear();
+  const DataLayout &DL = BB->getModule()->getDataLayout();
+  for (Instruction &I : *BB) {
+    LoadInst *LI = dyn_cast<LoadInst>(&I);
+    if (!LI)
+      continue;
+
+    // Don't touch volatile loads.
+    if (!LI->isSimple())
+      continue;
+
+    // Check that the pointer points to scalars.
+    Type *Ty = LI->getType();
+    if (!isValidElementType(Ty))
+      continue;
+
+    // Find the base pointer.
+    Value *Ptr = GetUnderlyingObject(LI->getPointerOperand(), DL);
+
+    // Save the load locations.
+    LoadRefs[Ptr].push_back(LI);
+    count++;
+  }
+  return count;
+}
 
 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
   unsigned count = 0;
@@ -3457,12 +3566,10 @@
     return false;
 
   unsigned Opcode0 = I0->getOpcode();
-  const DataLayout &DL = I0->getModule()->getDataLayout();
 
-  Type *Ty0 = I0->getType();
-  unsigned Sz = DL.getTypeSizeInBits(Ty0);
   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
+  unsigned Sz = R.getMemAccessWidth(I0);
   unsigned VF = MinVecRegSize / Sz;
 
   for (Value *V : VL) {
@@ -4170,6 +4277,78 @@
   return Changed;
 }
 
+bool SLPVectorizer::vectorizeLoadChains(BasicBlock *BB, BoUpSLP &R) {
+  auto Changed = false;
+  auto &DL = BB->getModule()->getDataLayout();
+  for (auto &Entry : LoadRefs) {
+
+    // If the load list has fewer than two elements, there's nothing to do.
+    if (Entry.second.size() < 2)
+      continue;
+
+    DEBUG(dbgs() << "SLP: Analyzing a load chain of length "
+                 << Entry.second.size() << ".\n");
+
+    // If the load list accesses contiguous memory, it doesn't make sense to
+    // pursue it further in a bottom-up phase. It should have already been
+    // vectorized if profitable. The code below gives up if a single pair of
+    // consecutive accesses is found.
+    auto IsConsecutive = false;
+    for (unsigned I = 0; I < Entry.second.size() && !IsConsecutive; ++I)
+      for (unsigned J = 0; J < Entry.second.size() && !IsConsecutive; ++J)
+        if (I != J)
+          if (R.isConsecutiveAccess(Entry.second[I], Entry.second[J], DL))
+            IsConsecutive = true;
+    if (IsConsecutive)
+      continue;
+
+    // Try and vectorize trees ending at non-consecutive loads. We are currenly
+    // only interested in gather-like cases of the form:
+    //
+    // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+    //
+    // where the loads of "a", the loads of "b", and the subtractions can be
+    // performed in parallel. It's likely that detecting this pattern
+    // in a bottom-up phase will be simpler and less costly than building a
+    // full-blown top-down phase beginning at the consecutive loads.
+    SmallVector<Value *, 8> Bundle;
+    for (auto VH : Entry.second) {
+
+      // The load may have been vectorized after we initially collected it. If
+      // so, the WeakVH will have nullified the value.
+      auto *Load = dyn_cast_or_null<LoadInst>(VH);
+      if (!Load)
+        break;
+
+      // We only handle loads in which the pointer operand is a getelementptr.
+      // If we encounter anything else, give up.
+      auto *GEP = dyn_cast<GetElementPtrInst>(Load->getPointerOperand());
+      if (!GEP)
+        break;
+
+      // Furthermore, we only handle getelementptr instructions having a single
+      // non-constant index. Constant indices are not interesting, and we limit
+      // ourselves to getelementptr's with a single index to save compile time.
+      // If we encounter anything else, give up.
+      //
+      // TODO: Relax these restrictions if we find an interesting case that we
+      //       miss and compile time is not an issue.
+      auto *Idx = GEP->idx_begin()->get();
+      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+        break;
+      Bundle.push_back(Idx);
+    }
+
+    // If the bundle has fewer than two elements, there's nothing to do.
+    if (Bundle.size() < 2)
+      continue;
+
+    // Try to vectorize the bundle.
+    Changed |= tryToVectorizeList(Bundle, R);
+  }
+  return Changed;
+}
+
 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
Index: test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -0,0 +1,262 @@
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we vectorize the index calculations in the
+; gather-reduce pattern shown below. We check cases having i32 and i64
+; subtraction.
+;
+; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[*a++ - *b++]; sum += g[*a++ - *b++];
+;     sum += g[*a++ - *b++]; sum += g[*a++ - *b++];
+;     sum += g[*a++ - *b++]; sum += g[*a++ - *b++];
+;     sum += g[*a++ - *b++]; sum += g[*a++ - *b++];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @gather_reduce_8x16_i32
+;
+; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
+; CHECK: zext <8 x i16> [[L]] to <8 x i32>
+; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i32
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1
+  %1 = load i16, i16* %b.addr.0100, align 2
+  %conv2 = zext i16 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i32
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i32
+  %sub8 = sub nsw i32 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i32
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i32
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i32
+  %sub26 = sub nsw i32 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i32
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i32
+  %sub35 = sub nsw i32 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i32
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i32
+  %sub44 = sub nsw i32 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i32
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i32
+  %sub53 = sub nsw i32 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i32
+  %incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i32
+  %sub62 = sub nsw i32 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @gather_reduce_8x16_i64
+;
+; CHECK-NOT: load <8 x i16>
+;
+; FIXME: We are currently unable to vectorize the case with i64 subtraction
+;        because the zero extensions are too expensive. The solution here is to
+;        convert the i64 subtractions to i32 subtractions during vectorization.
+;        This would then match the case above.
+;
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i64
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1
+  %1 = load i16, i16* %b.addr.0100, align 2
+  %conv2 = zext i16 %1 to i64
+  %sub = sub nsw i64 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i64
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i64
+  %sub8 = sub nsw i64 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i64
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i64
+  %sub17 = sub nsw i64 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i64
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i64
+  %sub26 = sub nsw i64 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i64
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i64
+  %sub35 = sub nsw i64 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i64
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i64
+  %sub44 = sub nsw i64 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i64
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i64
+  %sub53 = sub nsw i64 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i64
+  %incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i64
+  %sub62 = sub nsw i64 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}