Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -837,6 +837,50 @@
   return nullptr;
 }
 
+/// \brief Look for an extractelement/insertvalue sequence that acts like a bitcast.
+///
+/// \returns the underlying value that was "cast", or nullptr otherwise.
+static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
+  Value *U = nullptr;
+  while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+    auto *EI = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+    if (!EI)
+      return nullptr;
+    auto *W = EI->getVectorOperand();
+    if (!U) {
+      U = W;
+    } else if (U != W)
+      return nullptr;
+    auto *CI = dyn_cast<ConstantInt>(EI->getIndexOperand());
+    if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+      return nullptr;
+    V = IV->getAggregateOperand();
+  }
+  if (!isa<UndefValue>(V) || !U)
+    return nullptr;
+
+  VectorType *UT = cast<VectorType>(U->getType());
+  Type *VT = V->getType();
+  // Check that types UT and VT are bitwise isomorphic.
+  const DataLayout &DL = IC.getDataLayout();
+  if (DL.getTypeSizeInBits(UT) != DL.getTypeSizeInBits(VT)) {
+    return nullptr;
+  }
+  if (ArrayType *AT = dyn_cast<ArrayType>(VT)) {
+    if (AT->getNumElements() != UT->getNumElements())
+      return nullptr;
+  } else {
+    StructType *ST = cast<StructType>(VT);
+    if (ST->getNumElements() != UT->getNumElements())
+      return nullptr;
+    for (const Type *EltT : ST->elements()) {
+      if (EltT != UT->getElementType())
+        return nullptr;
+    }
+  }
+  return U;
+}
+
 /// \brief Combine stores to match the type of value being stored.
 ///
 /// The core idea here is that the memory does not have any intrinsic type and
@@ -872,6 +916,11 @@
     return true;
   }
 
+  if (Value *U = likeBitCastFromVector(IC, V)) {
+    combineStoreToNewValue(IC, SI, U);
+    return true;
+  }
+
   // FIXME: We should also canonicalize loads of vectors when their elements are
   // cast to other types.
   return false;
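A minimal before/after sketch of the pattern this InstCombine change targets (value names are illustrative only; the julia_2xdouble test added below exercises the same shape):

  ; before: vector lanes are extracted, re-packed into an isomorphic aggregate, and stored
  %x0 = extractelement <2 x double> %x, i32 0
  %i0 = insertvalue [2 x double] undef, double %x0, 0
  %x1 = extractelement <2 x double> %x, i32 1
  %i1 = insertvalue [2 x double] %i0, double %x1, 1
  store [2 x double] %i1, [2 x double]* %p, align 4

  ; after: likeBitCastFromVector recovers %x, and combineStoreToNewValue stores it
  ; through a re-typed pointer, roughly:
  %pv = bitcast [2 x double]* %p to <2 x double>*
  store <2 x double> %x, <2 x double>* %pv, align 4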
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -273,32 +273,93 @@
   return Ty;
 }
 
+/// \brief Check if an ArrayType or StructType is isomorphic to some VectorType.
+///
+/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+static unsigned canMapToVector(Type *T, const DataLayout &DL) {
+  unsigned N;
+  auto *ST = dyn_cast<StructType>(T);
+  if (ST) {
+    N = ST->getNumElements();
+  } else {
+    N = cast<ArrayType>(T)->getNumElements();
+  }
+  if (N != 2 && N != 4 && N != 8 && N != 16)
+    // Reject other vector lengths as generally unprofitable.
+    return 0;
+  Type *EltT;
+  if (ST) {
+    auto I = ST->element_begin();
+    EltT = *I;
+    // Check that the struct is homogeneous.
+    auto E = ST->element_end();
+    while (++I != E)
+      if (*I != EltT)
+        return 0;
+  } else {
+    EltT = cast<ArrayType>(T)->getElementType();
+  }
+  if (!VectorType::isValidElementType(EltT))
+    return 0;
+  VectorType *VT = VectorType::get(EltT, N);
+  if (DL.getTypeSizeInBits(VT) != DL.getTypeSizeInBits(T)) {
+    return 0;
+  }
+  return N;
+}
+
+/// \returns True if the Extract{Value,Element} instruction extracts element i.
+static bool matchExtractIndex(Instruction *E, unsigned i, unsigned Opcode) {
+  if (Opcode == Instruction::ExtractElement) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+    return CI && CI->getZExtValue() == i;
+  } else {
+    assert(Opcode == Instruction::ExtractValue);
+    ExtractValueInst *EI = cast<ExtractValueInst>(E);
+    assert(EI->getNumIndices() == 1);
+    return *EI->idx_begin() == i;
+  }
+}
+
 /// \returns True if the ExtractElement instructions in VL can be vectorized
 /// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
-  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
+static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) {
+  assert(Opcode == Instruction::ExtractElement ||
+         Opcode == Instruction::ExtractValue);
+  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
   // Check if all of the extracts come from the same vector and from the
   // correct offset.
   Value *VL0 = VL[0];
-  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
+  Instruction *E0 = cast<Instruction>(VL0);
   Value *Vec = E0->getOperand(0);
 
-  // We have to extract from the same vector type.
-  unsigned NElts = Vec->getType()->getVectorNumElements();
+  // We have to extract from a vector/aggregate with the same number of elements.
+  unsigned NElts;
+  if (Opcode == Instruction::ExtractValue) {
+    const DataLayout &DL = E0->getModule()->getDataLayout();
+    NElts = canMapToVector(Vec->getType(), DL);
+    if (!NElts)
+      return false;
+    // Check that the aggregate load can be rewritten as a load of a vector.
+    LoadInst *LI = dyn_cast<LoadInst>(Vec);
+    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+      return false;
+  } else {
+    NElts = Vec->getType()->getVectorNumElements();
+  }
 
   if (NElts != VL.size())
     return false;
 
   // Check that all of the indices extract from the correct offset.
-  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
-  if (!CI || CI->getZExtValue())
+  if (!matchExtractIndex(E0, 0, Opcode))
     return false;
 
   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
-    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
-    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
+    Instruction *E = cast<Instruction>(VL[i]);
+    if (!matchExtractIndex(E, i, Opcode))
+      return false;
+    if (E->getOperand(0) != Vec)
       return false;
   }
 
@@ -1145,8 +1206,9 @@
       }
       return;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      bool Reuse = CanReuseExtract(VL);
+      bool Reuse = CanReuseExtract(VL, Opcode);
       if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
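For reference, this is the kind of extractvalue bundle the extended CanReuseExtract is meant to accept (illustrative names; cf. the julia_load_array_of_float test below): every element of an aggregate whose layout canMapToVector approves is extracted, in order, from a single simple load that has no other uses:

  %v  = load [4 x float], [4 x float]* %p, align 4
  %v0 = extractvalue [4 x float] %v, 0
  %v1 = extractvalue [4 x float] %v, 1
  %v2 = extractvalue [4 x float] %v, 2
  %v3 = extractvalue [4 x float] %v, 3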
@@ -1488,11 +1550,12 @@
     case Instruction::PHI: {
       return 0;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(VL)) {
+      if (CanReuseExtract(VL, Opcode)) {
         int DeadCost = 0;
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-          ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+          Instruction *E = cast<Instruction>(VL[i]);
           if (E->hasOneUse())
             // Take credit for instruction that will become dead.
             DeadCost +=
@@ -2232,13 +2295,25 @@
     }
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(E->Scalars)) {
+      if (CanReuseExtract(E->Scalars, Instruction::ExtractElement)) {
         Value *V = VL0->getOperand(0);
         E->VectorizedValue = V;
         return V;
       }
       return Gather(E->Scalars, VecTy);
     }
+    case Instruction::ExtractValue: {
+      if (CanReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+        Builder.SetInsertPoint(LI);
+        PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+        E->VectorizedValue = V;
+        return propagateMetadata(V, E->Scalars);
+      }
+      return Gather(E->Scalars, VecTy);
+    }
     case Instruction::ZExt:
     case Instruction::SExt:
     case Instruction::FPToUI:
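Assuming CanReuseExtract accepted the bundle above, the new Instruction::ExtractValue case in vectorizeTree re-emits the aggregate load as a vector load at the original load's position, reusing its alignment; roughly (names illustrative):

  %vecptr = bitcast [4 x float]* %p to <4 x float>*
  %vec    = load <4 x float>, <4 x float>* %vecptr, align 4

The original scalar extractvalue instructions are then either replaced through the usual external-use extraction path or simply become dead once the whole tree is vectorized.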
@@ -3536,13 +3610,14 @@
       for (auto &V : BuildVectorSlice) {
         IRBuilder<true, NoFolder> Builder(
             InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
-        InsertElementInst *IE = cast<InsertElementInst>(V);
+        Instruction *I = cast<Instruction>(V);
+        assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
         Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
             VectorizedRoot, Builder.getInt32(VecIdx++)));
-        IE->setOperand(1, Extract);
-        IE->removeFromParent();
-        IE->insertAfter(Extract);
-        InsertAfter = IE;
+        I->setOperand(1, Extract);
+        I->removeFromParent();
+        I->insertAfter(Extract);
+        InsertAfter = I;
       }
     }
     // Move to the next bundle.
@@ -3933,6 +4008,25 @@
   return false;
 }
 
+/// \brief Like findBuildVector, but looks backwards for the construction of an aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+                               SmallVectorImpl<Value *> &BuildVector,
+                               SmallVectorImpl<Value *> &BuildVectorOpds) {
+  if (!IV->hasOneUse())
+    return false;
+  Value *V = IV->getAggregateOperand();
+  if (!isa<UndefValue>(V)) {
+    InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+    if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+      return false;
+  }
+  BuildVector.push_back(IV);
+  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+  return true;
+}
+
 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   return V->getType() < V2->getType();
 }
@@ -4172,6 +4266,29 @@
       continue;
     }
 
+    // Try to vectorize trees that start at insertvalue instructions feeding into
+    // a store.
+    if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+      if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+        const DataLayout &DL = BB->getModule()->getDataLayout();
+        if (canMapToVector(SI->getValueOperand()->getType(), DL)) {
+          SmallVector<Value *, 16> BuildVector;
+          SmallVector<Value *, 16> BuildVectorOpds;
+          if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds)) {
+            continue;
+          }
+
+          DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+          if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+          }
+          continue;
+        }
+      }
+    }
   }
 
   return Changed;
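As an illustration of what findBuildAggregate collects (names illustrative; the julia_2xdouble test in insertvalue.ll below exercises this end to end), given a store of an insertvalue chain such as:

  %i0 = insertvalue [2 x double] undef, double %a0, 0
  %i1 = insertvalue [2 x double] %i0, double %a1, 1
  store [2 x double] %i1, [2 x double]* %p, align 4

it fills BuildVector with {%i0, %i1} and BuildVectorOpds with {%a0, %a1}, and the operands are then handed to tryToVectorizeList just as an insertelement-based build vector would be.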
Index: test/Transforms/InstCombine/insert-val-extract-elem.ll
===================================================================
--- test/Transforms/InstCombine/insert-val-extract-elem.ll
+++ test/Transforms/InstCombine/insert-val-extract-elem.ll
@@ -0,0 +1,33 @@
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <2 x double>
+define void @julia_2xdouble([2 x double]* sret, <2 x double>*) {
+top:
+  %x = load <2 x double>, <2 x double>* %1
+  %x0 = extractelement <2 x double> %x, i32 0
+  %i0 = insertvalue [2 x double] undef, double %x0, 0
+  %x1 = extractelement <2 x double> %x, i32 1
+  %i1 = insertvalue [2 x double] %i0, double %x1, 1
+  store [2 x double] %i1, [2 x double]* %0, align 4
+  ret void
+}
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <4 x float>
+define void @julia_4xfloat([4 x float]* sret, <4 x float>*) {
+top:
+  %x = load <4 x float>, <4 x float>* %1
+  %x0 = extractelement <4 x float> %x, i32 0
+  %i0 = insertvalue [4 x float] undef, float %x0, 0
+  %x1 = extractelement <4 x float> %x, i32 1
+  %i1 = insertvalue [4 x float] %i0, float %x1, 1
+  %x2 = extractelement <4 x float> %x, i32 2
+  %i2 = insertvalue [4 x float] %i1, float %x2, 2
+  %x3 = extractelement <4 x float> %x, i32 3
+  %i3 = insertvalue [4 x float] %i2, float %x3, 3
+  store [4 x float] %i3, [4 x float]* %0, align 4
+  ret void
+}
Index: test/Transforms/SLPVectorizer/X86/insertvalue.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/insertvalue.ll
+++ test/Transforms/SLPVectorizer/X86/insertvalue.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+;CHECK: load <2 x double>
+;CHECK: load <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: fadd <2 x double>
+define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+top:
+  %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
+  %x0 = load double, double* %px0, align 4
+  %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
+  %y0 = load double, double* %py0, align 4
+  %m0 = fmul double %x0, %y0
+  %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
+  %x1 = load double, double* %px1, align 4
+  %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
+  %y1 = load double, double* %py1, align 4
+  %m1 = fmul double %x1, %y1
+  %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
+  %z0 = load double, double* %pz0, align 4
+  %a0 = fadd double %m0, %z0
+  %i0 = insertvalue [2 x double] undef, double %a0, 0
+  %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
+  %z1 = load double, double* %pz1, align 4
+  %a1 = fadd double %m1, %z1
+  %i1 = insertvalue [2 x double] %i0, double %a1, 1
+  store [2 x double] %i1, [2 x double]* %0, align 4
+  ret void
+}
+
+;CHECK: load <4 x float>
+;CHECK: load <4 x float>
+;CHECK: fmul <4 x float>
+;CHECK: fadd <4 x float>
+define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+top:
+  %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
+  %x0 = load float, float* %px0, align 4
+  %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
+  %y0 = load float, float* %py0, align 4
+  %m0 = fmul float %x0, %y0
+  %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
+  %x1 = load float, float* %px1, align 4
+  %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
+  %y1 = load float, float* %py1, align 4
+  %m1 = fmul float %x1, %y1
+  %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
+  %x2 = load float, float* %px2, align 4
+  %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
+  %y2 = load float, float* %py2, align 4
+  %m2 = fmul float %x2, %y2
+  %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
+  %x3 = load float, float* %px3, align 4
+  %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
+  %y3 = load float, float* %py3, align 4
+  %m3 = fmul float %x3, %y3
+  %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
+  %z0 = load float, float* %pz0, align 4
+  %a0 = fadd float %m0, %z0
+  %i0 = insertvalue [4 x float] undef, float %a0, 0
+  %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
+  %z1 = load float, float* %pz1, align 4
+  %a1 = fadd float %m1, %z1
+  %i1 = insertvalue [4 x float] %i0, float %a1, 1
+  %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
+  %z2 = load float, float* %pz2, align 4
+  %a2 = fadd float %m2, %z2
+  %i2 = insertvalue [4 x float] %i1, float %a2, 2
+  %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
+  %z3 = load float, float* %pz3, align 4
+  %a3 = fadd float %m3, %z3
+  %i3 = insertvalue [4 x float] %i2, float %a3, 3
+  store [4 x float] %i3, [4 x float]* %0, align 4
+  ret void
+}
+
+;CHECK: fsub <4 x float>
+define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+top:
+  %a_arr = load [4 x float], [4 x float]* %a, align 4
+  %a0 = extractvalue [4 x float] %a_arr, 0
+  %a2 = extractvalue [4 x float] %a_arr, 2
+  %a1 = extractvalue [4 x float] %a_arr, 1
+  %b_arr = load [4 x float], [4 x float]* %b, align 4
+  %b0 = extractvalue [4 x float] %b_arr, 0
+  %b2 = extractvalue [4 x float] %b_arr, 2
+  %b1 = extractvalue [4 x float] %b_arr, 1
+  %a3 = extractvalue [4 x float] %a_arr, 3
+  %c1 = fsub float %a1, %b1
+  %b3 = extractvalue [4 x float] %b_arr, 3
+  %c0 = fsub float %a0, %b0
+  %c2 = fsub float %a2, %b2
+  %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
+  %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
+  %c3 = fsub float %a3, %b3
+  %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
+  %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
+  store [4 x float] %c_arr3, [4 x float]* %c, align 4
+  ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+;CHECK: fsub <4 x float>
+define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+top:
+  %a_struct = load %pseudovec, %pseudovec* %a, align 4
+  %a0 = extractvalue %pseudovec %a_struct, 0
+  %a1 = extractvalue %pseudovec %a_struct, 1
+  %b_struct = load %pseudovec, %pseudovec* %b, align 4
+  %a2 = extractvalue %pseudovec %a_struct, 2
+  %b0 = extractvalue %pseudovec %b_struct, 0
+  %a3 = extractvalue %pseudovec %a_struct, 3
+  %c0 = fsub float %a0, %b0
+  %b1 = extractvalue %pseudovec %b_struct, 1
+  %b2 = extractvalue %pseudovec %b_struct, 2
+  %c1 = fsub float %a1, %b1
+  %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
+  %b3 = extractvalue %pseudovec %b_struct, 3
+  %c3 = fsub float %a3, %b3
+  %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
+  %c2 = fsub float %a2, %b2
+  %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
+  %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
+  store %pseudovec %c_struct3, %pseudovec* %c, align 4
+  ret void
+}