Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -278,36 +278,17 @@
   return Ty;
 }
 
-/// \returns True if the ExtractElement instructions in VL can be vectorized
-/// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
-  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
-  // Check if all of the extracts come from the same vector and from the
-  // correct offset.
-  Value *VL0 = VL[0];
-  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
-  Value *Vec = E0->getOperand(0);
-
-  // We have to extract from the same vector type.
-  unsigned NElts = Vec->getType()->getVectorNumElements();
-
-  if (NElts != VL.size())
-    return false;
-
-  // Check that all of the indices extract from the correct offset.
-  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
-  if (!CI || CI->getZExtValue())
-    return false;
-
-  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+  assert(Opcode == Instruction::ExtractElement ||
+         Opcode == Instruction::ExtractValue);
+  if (Opcode == Instruction::ExtractElement) {
     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
-    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
-      return false;
+    return CI && CI->getZExtValue() == Idx;
+  } else {
+    ExtractValueInst *EI = cast<ExtractValueInst>(E);
+    return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
   }
-
-  return true;
 }
 
 /// \returns True if in-tree use also needs extract. This refers to
@@ -373,6 +354,18 @@
       SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
       DL(DL), Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+    // Use the vector register size specified by the target unless overridden
+    // by a command-line option.
+    // TODO: It would be better to limit the vectorization factor based on
+    //       data type rather than just register size. For example, x86 AVX has
+    //       256-bit registers, but it does not support integer operations
+    //       at that width (that requires AVX2).
+    if (MaxVectorRegSizeOption.getNumOccurrences())
+      MaxVecRegSize = MaxVectorRegSizeOption;
+    else
+      MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+    MinVecRegSize = MinVectorRegSizeOption;
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -426,6 +419,21 @@
   /// vectorizable tree.
   void computeMinimumValueSizes();
 
+  /// \returns maximum vector register size as set by TTI or overridden by cl::opt.
+  unsigned getMaxVecRegSize() const {
+    return MaxVecRegSize;
+  }
+
+  /// \returns minimum vector register size as set by cl::opt.
+  unsigned getMinVecRegSize() const {
+    return MinVecRegSize;
+  }
+
+  /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+  ///
+  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+  unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
 private:
   struct TreeEntry;
 
@@ -435,6 +443,10 @@
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
 
+  /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+  /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
+  bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+
   /// Vectorize a single entry in the tree.
   Value *vectorizeTree(TreeEntry *E);
 
@@ -926,6 +938,8 @@
   AssumptionCache *AC;
   DemandedBits *DB;
   const DataLayout *DL;
+  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
 
   /// Instruction builder to construct the vectorized tree.
   IRBuilder<> Builder;
@@ -1159,8 +1173,9 @@
       }
       return;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      bool Reuse = CanReuseExtract(VL);
+      bool Reuse = canReuseExtract(VL, Opcode);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
       } else {
@@ -1477,6 +1492,74 @@
   }
 }
 
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+  unsigned N;
+  Type *EltTy;
+  auto *ST = dyn_cast<StructType>(T);
+  if (ST) {
+    N = ST->getNumElements();
+    EltTy = *ST->element_begin();
+  } else {
+    N = cast<ArrayType>(T)->getNumElements();
+    EltTy = cast<ArrayType>(T)->getElementType();
+  }
+  if (!isValidElementType(EltTy))
+    return 0;
+  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+    return 0;
+  if (ST) {
+    // Check that struct is homogeneous.
+    for (const auto *Ty : ST->elements())
+      if (Ty != EltTy)
+        return 0;
+  }
+  return N;
+}
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
+  assert(Opcode == Instruction::ExtractElement ||
+         Opcode == Instruction::ExtractValue);
+  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+  // Check if all of the extracts come from the same vector and from the
+  // correct offset.
+  Value *VL0 = VL[0];
+  Instruction *E0 = cast<Instruction>(VL0);
+  Value *Vec = E0->getOperand(0);
+
+  // We have to extract from a vector/aggregate with the same number of elements.
+  unsigned NElts;
+  if (Opcode == Instruction::ExtractValue) {
+    const DataLayout &DL = E0->getModule()->getDataLayout();
+    NElts = canMapToVector(Vec->getType(), DL);
+    if (!NElts)
+      return false;
+    // Check if load can be rewritten as load of vector.
+    LoadInst *LI = dyn_cast<LoadInst>(Vec);
+    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+      return false;
+  } else {
+    NElts = Vec->getType()->getVectorNumElements();
+  }
+
+  if (NElts != VL.size())
+    return false;
+
+  // Check that all of the indices extract from the correct offset.
+  if (!matchExtractIndex(E0, 0, Opcode))
+    return false;
+
+  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+    Instruction *E = cast<Instruction>(VL[i]);
+    if (!matchExtractIndex(E, i, Opcode))
+      return false;
+    if (E->getOperand(0) != Vec)
+      return false;
+  }
+
+  return true;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value *> VL = E->Scalars;
 
@@ -1506,11 +1589,12 @@
     case Instruction::PHI: {
       return 0;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(VL)) {
+      if (canReuseExtract(VL, Opcode)) {
         int DeadCost = 0;
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-          ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+          Instruction *E = cast<Instruction>(VL[i]);
           if (E->hasOneUse())
             // Take credit for instruction that will become dead.
             DeadCost +=
@@ -2198,13 +2282,25 @@
     }
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(E->Scalars)) {
+      if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
         Value *V = VL0->getOperand(0);
         E->VectorizedValue = V;
         return V;
       }
       return Gather(E->Scalars, VecTy);
     }
+    case Instruction::ExtractValue: {
+      if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+        Builder.SetInsertPoint(LI);
+        PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+        E->VectorizedValue = V;
+        return propagateMetadata(V, E->Scalars);
+      }
+      return Gather(E->Scalars, VecTy);
+    }
     case Instruction::ZExt:
     case Instruction::SExt:
     case Instruction::FPToUI:
@@ -3418,19 +3514,6 @@
   if (!TTI->getNumberOfRegisters(true))
     return false;
 
-  // Use the vector register size specified by the target unless overridden
-  // by a command-line option.
-  // TODO: It would be better to limit the vectorization factor based on
-  //       data type rather than just register size. For example, x86 AVX has
-  //       256-bit registers, but it does not support integer operations
-  //       at that width (that requires AVX2).
-  if (MaxVectorRegSizeOption.getNumOccurrences())
-    MaxVecRegSize = MaxVectorRegSizeOption;
-  else
-    MaxVecRegSize = TTI->getRegisterBitWidth(true);
-
-  MinVecRegSize = MinVectorRegSizeOption;
-
   // Don't vectorize when the attribute NoImplicitFloat is used.
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
     return false;
@@ -3538,9 +3621,6 @@
   /// The getelementptr instructions in a basic block organized by base pointer.
   WeakVHListMap GEPs;
-
-  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
-  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
 };
 
 /// \brief Check that the Values in the slice in VL array are still existent in
@@ -3658,7 +3738,7 @@
 
   // FIXME: Is division-by-2 the correct step? Should we assert that the
   // register size is a power-of-2?
-  for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+  for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) {
     if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
       // Mark the vectorized stores so that we don't vectorize them again.
       VectorizedStores.insert(Operands.begin(), Operands.end());
@@ -3731,7 +3811,7 @@
   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
   unsigned Sz = R.getVectorElementSize(I0);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = R.getMinVecRegSize() / Sz;
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
@@ -3798,13 +3878,14 @@
       for (auto &V : BuildVectorSlice) {
         IRBuilder<true, NoFolder> Builder(InsertAfter->getParent(),
                                           ++BasicBlock::iterator(InsertAfter));
-        InsertElementInst *IE = cast<InsertElementInst>(V);
+        Instruction *I = cast<Instruction>(V);
+        assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
         Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
             VectorizedRoot, Builder.getInt32(VecIdx++)));
-        IE->setOperand(1, Extract);
-        IE->removeFromParent();
-        IE->insertAfter(Extract);
-        InsertAfter = IE;
+        I->setOperand(1, Extract);
+        I->removeFromParent();
+        I->insertAfter(Extract);
+        InsertAfter = I;
       }
     }
     // Move to the next bundle.
@@ -4201,6 +4282,25 @@
   return false;
 }
 
+/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+                               SmallVectorImpl<Value *> &BuildVector,
+                               SmallVectorImpl<Value *> &BuildVectorOpds) {
+  if (!IV->hasOneUse())
+    return false;
+  Value *V = IV->getAggregateOperand();
+  if (!isa<UndefValue>(V)) {
+    InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+    if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+      return false;
+  }
+  BuildVector.push_back(IV);
+  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+  return true;
+}
+
 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   return V->getType() < V2->getType();
 }
@@ -4357,7 +4457,7 @@
         continue;
 
       // Try to match and vectorize a horizontal reduction.
-      if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) {
+      if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
@@ -4385,7 +4485,7 @@
       if (BinaryOperator *BinOp =
              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
         if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
-                                        MinVecRegSize) ||
+                                        R.getMinVecRegSize()) ||
             tryToVectorize(BinOp, R)) {
           Changed = true;
          it = BB->begin();
@@ -4453,6 +4553,28 @@
      continue;
    }
 
+    // Try to vectorize trees that start at insertvalue instructions feeding into
+    // a store.
+    if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+      if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+        const DataLayout &DL = BB->getModule()->getDataLayout();
+        if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
+          SmallVector<Value *, 16> BuildVector;
+          SmallVector<Value *, 16> BuildVectorOpds;
+          if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
+            continue;
+
+          DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+          if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+          }
+          continue;
+        }
+      }
+    }
   }
 
   return Changed;
Index: test/Transforms/SLPVectorizer/X86/insertvalue.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/insertvalue.ll
+++ test/Transforms/SLPVectorizer/X86/insertvalue.ll
@@ -0,0 +1,189 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+; CHECK-LABEL: julia_2xdouble
+; CHECK: load <2 x double>
+; CHECK: load <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: fadd <2 x double>
+define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+top:
+  %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
+  %x0 = load double, double* %px0, align 4
+  %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
+  %y0 = load double, double* %py0, align 4
+  %m0 = fmul double %x0, %y0
+  %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
+  %x1 = load double, double* %px1, align 4
+  %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
+  %y1 = load double, double* %py1, align 4
+  %m1 = fmul double %x1, %y1
+  %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
+  %z0 = load double, double* %pz0, align 4
+  %a0 = fadd double %m0, %z0
+  %i0 = insertvalue [2 x double] undef, double %a0, 0
+  %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
+  %z1 = load double, double* %pz1, align 4
+  %a1 = fadd double %m1, %z1
+  %i1 = insertvalue [2 x double] %i0, double %a1, 1
+  store [2 x double] %i1, [2 x double]* %0, align 4
+  ret void
+}
+
+; CHECK-LABEL: julia_4xfloat
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fmul <4 x float>
+; CHECK: fadd <4 x float>
+define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+top:
+  %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
+  %x0 = load float, float* %px0, align 4
+  %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
+  %y0 = load float, float* %py0, align 4
+  %m0 = fmul float %x0, %y0
+  %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
+  %x1 = load float, float* %px1, align 4
+  %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
+  %y1 = load float, float* %py1, align 4
+  %m1 = fmul float %x1, %y1
+  %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
+  %x2 = load float, float* %px2, align 4
+  %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
+  %y2 = load float, float* %py2, align 4
+  %m2 = fmul float %x2, %y2
+  %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
+  %x3 = load float, float* %px3, align 4
+  %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
+  %y3 = load float, float* %py3, align 4
+  %m3 = fmul float %x3, %y3
+  %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
+  %z0 = load float, float* %pz0, align 4
+  %a0 = fadd float %m0, %z0
+  %i0 = insertvalue [4 x float] undef, float %a0, 0
+  %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
+  %z1 = load float, float* %pz1, align 4
+  %a1 = fadd float %m1, %z1
+  %i1 = insertvalue [4 x float] %i0, float %a1, 1
+  %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
+  %z2 = load float, float* %pz2, align 4
+  %a2 = fadd float %m2, %z2
+  %i2 = insertvalue [4 x float] %i1, float %a2, 2
+  %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
+  %z3 = load float, float* %pz3, align 4
+  %a3 = fadd float %m3, %z3
+  %i3 = insertvalue [4 x float] %i2, float %a3, 3
+  store [4 x float] %i3, [4 x float]* %0, align 4
+  ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_float
+; CHECK: fsub <4 x float>
+define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+top:
+  %a_arr = load [4 x float], [4 x float]* %a, align 4
+  %a0 = extractvalue [4 x float] %a_arr, 0
+  %a2 = extractvalue [4 x float] %a_arr, 2
+  %a1 = extractvalue [4 x float] %a_arr, 1
+  %b_arr = load [4 x float], [4 x float]* %b, align 4
+  %b0 = extractvalue [4 x float] %b_arr, 0
+  %b2 = extractvalue [4 x float] %b_arr, 2
+  %b1 = extractvalue [4 x float] %b_arr, 1
+  %a3 = extractvalue [4 x float] %a_arr, 3
+  %c1 = fsub float %a1, %b1
+  %b3 = extractvalue [4 x float] %b_arr, 3
+  %c0 = fsub float %a0, %b0
+  %c2 = fsub float %a2, %b2
+  %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
+  %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
+  %c3 = fsub float %a3, %b3
+  %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
+  %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
+  store [4 x float] %c_arr3, [4 x float]* %c, align 4
+  ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_i32
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: sub <4 x i32>
+define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
+top:
+  %a_arr = load [4 x i32], [4 x i32]* %a, align 4
+  %a0 = extractvalue [4 x i32] %a_arr, 0
+  %a2 = extractvalue [4 x i32] %a_arr, 2
+  %a1 = extractvalue [4 x i32] %a_arr, 1
+  %b_arr = load [4 x i32], [4 x i32]* %b, align 4
+  %b0 = extractvalue [4 x i32] %b_arr, 0
+  %b2 = extractvalue [4 x i32] %b_arr, 2
+  %b1 = extractvalue [4 x i32] %b_arr, 1
+  %a3 = extractvalue [4 x i32] %a_arr, 3
+  %c1 = sub i32 %a1, %b1
+  %b3 = extractvalue [4 x i32] %b_arr, 3
+  %c0 = sub i32 %a0, %b0
+  %c2 = sub i32 %a2, %b2
+  %c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
+  %c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
+  %c3 = sub i32 %a3, %b3
+  %c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
+  %c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
+  store [4 x i32] %c_arr3, [4 x i32]* %c, align 4
+  ret void
+}
+
+; Almost identical to previous test, but for a type that should NOT be vectorized: a <4 x i16> is only 64 bits wide, below the minimum vector register size (128 by default).
+;
+; CHECK-LABEL: julia_load_array_of_i16
+; CHECK-NOT: i16>
+define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
+top:
+  %a_arr = load [4 x i16], [4 x i16]* %a, align 4
+  %a0 = extractvalue [4 x i16] %a_arr, 0
+  %a2 = extractvalue [4 x i16] %a_arr, 2
+  %a1 = extractvalue [4 x i16] %a_arr, 1
+  %b_arr = load [4 x i16], [4 x i16]* %b, align 4
+  %b0 = extractvalue [4 x i16] %b_arr, 0
+  %b2 = extractvalue [4 x i16] %b_arr, 2
+  %b1 = extractvalue [4 x i16] %b_arr, 1
+  %a3 = extractvalue [4 x i16] %a_arr, 3
+  %c1 = sub i16 %a1, %b1
+  %b3 = extractvalue [4 x i16] %b_arr, 3
+  %c0 = sub i16 %a0, %b0
+  %c2 = sub i16 %a2, %b2
+  %c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
+  %c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
+  %c3 = sub i16 %a3, %b3
+  %c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
+  %c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
+  store [4 x i16] %c_arr3, [4 x i16]* %c, align 4
+  ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+; CHECK-LABEL: julia_load_struct_of_float
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fsub <4 x float>
+define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+top:
+  %a_struct = load %pseudovec, %pseudovec* %a, align 4
+  %a0 = extractvalue %pseudovec %a_struct, 0
+  %a1 = extractvalue %pseudovec %a_struct, 1
+  %b_struct = load %pseudovec, %pseudovec* %b, align 4
+  %a2 = extractvalue %pseudovec %a_struct, 2
+  %b0 = extractvalue %pseudovec %b_struct, 0
+  %a3 = extractvalue %pseudovec %a_struct, 3
+  %c0 = fsub float %a0, %b0
+  %b1 = extractvalue %pseudovec %b_struct, 1
+  %b2 = extractvalue %pseudovec %b_struct, 2
+  %c1 = fsub float %a1, %b1
+  %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
+  %b3 = extractvalue %pseudovec %b_struct, 3
+  %c3 = fsub float %a3, %b3
+  %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
+  %c2 = fsub float %a2, %b2
+  %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
+  %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
+  store %pseudovec %c_struct3, %pseudovec* %c, align 4
+  ret void
+}
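
A quick illustration of the store-rooted pattern this patch recognizes (a sketch, not part of the patch or its test suite; the function name @build_aggregate_sketch and all value names are made up): a chain of single-use insertvalue instructions builds a homogeneous aggregate that feeds a store. findBuildAggregate walks the chain backwards from the store's value operand, canMapToVector checks that the aggregate type is layout-compatible with a legal vector type, and tryToVectorizeList then vectorizes the collected scalar operands.

; With the patch, the two loads and the two fadds below can be combined into
; single <2 x double> operations (subject to the cost model and the minimum
; vector register size); the insertvalue chain is then fed via extractelement.
define void @build_aggregate_sketch([2 x double]* %dst, [2 x double]* %src) {
top:
  %p0 = getelementptr inbounds [2 x double], [2 x double]* %src, i64 0, i64 0
  %p1 = getelementptr inbounds [2 x double], [2 x double]* %src, i64 0, i64 1
  %x0 = load double, double* %p0, align 8
  %x1 = load double, double* %p1, align 8
  %a0 = fadd double %x0, 1.0
  %a1 = fadd double %x1, 1.0
  %v0 = insertvalue [2 x double] undef, double %a0, 0 ; chain starts at undef
  %v1 = insertvalue [2 x double] %v0, double %a1, 1   ; last link, single use
  store [2 x double] %v1, [2 x double]* %dst, align 8 ; root found at the store
  ret void
}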