Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -155,6 +155,7 @@
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
 
   FunctionType *FTy = F.getFunctionType();
@@ -395,11 +396,28 @@
   return GEP->getOperand(2);
 }
 
-// Not an instruction handled below to turn into a vector.
+static Value *EVToVectorIndex(ExtractValueInst *EV, Module *Mod) {
+  // FIXME: we only support simple cases.
+  if (EV->getNumIndices() != 1)
+    return nullptr;
+
+  return ConstantInt::get(Type::getInt32Ty(Mod->getContext()),
+                          EV->getIndices()[0]);
+}
+
+static Value *IVToVectorIndex(InsertValueInst *IV, Module *Mod) {
+  // FIXME: we only support simple cases.
+  if (IV->getNumIndices() != 1)
+    return nullptr;
+
+  return ConstantInt::get(Type::getInt32Ty(Mod->getContext()),
+                          IV->getIndices()[0]);
+}
+
+// Check that a GEP user can be vectorized during the transformation phase.
+// The instructions checked here operate on the element type of the original
+// alloca'd array aggregate.
 //
 // TODO: Check isTriviallyVectorizable for calls and handle other
 // instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeSimpleInst(Instruction *Inst, User *User) {
   switch (Inst->getOpcode()) {
   case Instruction::Load:
   case Instruction::BitCast:
@@ -415,7 +433,261 @@
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
+// Check that we can vectorize a load of an entire aggregate (at this stage,
+// arrays of 2, 3 or 4 elements). Checking up front lets us give up on cases
+// we cannot handle rather than breaking later, during the transform phase.
+// We can handle the simple cases where every use of the load is an
+// extractvalue, insertvalue or store; these are the patterns observed from
+// the front-ends so far.
+// TODO: Extend to more cases.
+static bool canVectorizeLoad(LoadInst *Inst, AllocaInst *Alloca,
+                             SmallSetVector<Value *, 8> &WorkList,
+                             std::map<GetElementPtrInst*, Value*> &GEPVectorIdx) {
+  // Loading the whole array: the uses have to be vectorized as well, so
+  // limit them to extractvalue, insertvalue and store.
+  for (User *LoadUser : Inst->users()) {
+    switch (cast<Instruction>(LoadUser)->getOpcode()) {
+    case Instruction::ExtractValue:
+      if (cast<ExtractValueInst>(LoadUser)->getNumIndices() != 1)
+        return false;
+      break;
+    case Instruction::InsertValue:
+      if (cast<InsertValueInst>(LoadUser)->getNumIndices() != 1)
+        return false;
+      break;
+    case Instruction::Store:
+      // We can handle these.
+      break;
+    default:
+      // More complicated - reject vectorization in this case.
+      return false;
+    }
+  }
+  // Add the load to the WorkList of items to transform. We use a SetVector
+  // because duplicates are possible when back-tracking from a store.
+  WorkList.insert(Inst);
+  return true;
+}
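+
+// As an illustrative sketch (the value names are invented and do not come
+// from the tests below), canVectorizeLoad accepts an aggregate load such as:
+//
+//   %agg = load [2 x float], [2 x float]* %alloca
+//   %elt = extractvalue [2 x float] %agg, 0
+//   store [2 x float] %agg, [2 x float] addrspace(7)* %out
+//
+// where every user of %agg is an extractvalue, insertvalue or store.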
+
+// Check that we can vectorize a store of an entire aggregate (at this stage,
+// arrays of 2, 3 or 4 elements). Checking up front lets us give up on cases
+// we cannot handle rather than breaking later, during the transform phase.
+// This handles only stores directly to the alloca. We backtrack from the
+// store to make sure that there are only insertvalues with one use each,
+// terminating in either a load or an undef. This is quite limited, but it
+// covers the cases encountered with current front-ends.
+// TODO: Extend to more cases.
+static bool canVectorizeStore(StoreInst *Inst, AllocaInst *Alloca,
+                              SmallSetVector<Value *, 8> &WorkList,
+                              std::map<GetElementPtrInst*, Value*> &GEPVectorIdx) {
+  // The alloca must be the stored pointer operand, not a stored value.
+  Value *Ptr = Inst->getPointerOperand();
+  if (Ptr != Alloca)
+    return false;
+
+  // We can handle insertvalue chains leading back to either an undef or to
+  // a load of an alloca that we convert to a vector.
+  Value *Val = Inst->getValueOperand();
+  while (isa<InsertValueInst>(Val)) {
+    // Can't deal with more than one use, or with more than one index
+    // (a multi-dimensional array).
+    if (!Val->hasOneUse() || cast<InsertValueInst>(Val)->getNumIndices() != 1)
+      return false;
+    Val = cast<InsertValueInst>(Val)->getAggregateOperand();
+  }
+  if (isa<UndefValue>(Val)) {
+    // We can handle the store; add it to the WorkList.
+    WorkList.insert(Inst);
+    return true;
+  }
+
+  if (isa<LoadInst>(Val)) {
+    // Check that the load can be vectorized. The load, rather than this
+    // store, is added to the WorkList as the starting point; transforming
+    // it will transform the store as well. We go through the load because
+    // it may have more than one use.
+    return canVectorizeLoad(cast<LoadInst>(Val), Alloca, WorkList,
+                            GEPVectorIdx);
+  }
+
+  // Not a value we handle.
+  return false;
+}
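+
+// As an illustrative sketch (invented value names), canVectorizeStore
+// accepts a chain of single-use insertvalues that starts at an undef and
+// ends at a store to the alloca:
+//
+//   %v0 = insertvalue [2 x float] undef, float %a, 0
+//   %v1 = insertvalue [2 x float] %v0, float %b, 1
+//   store [2 x float] %v1, [2 x float]* %alloca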
+
+// Build a WorkList of instructions that will need modification once the
+// alloca is vectorized. If something that prevents vectorization is
+// encountered, return false.
+static bool canVectorizeInst(AllocaInst *Alloca,
+                             SmallSetVector<Value *, 8> &WorkList,
+                             std::map<GetElementPtrInst*, Value*> &GEPVectorIdx) {
+  for (User *AllocaUser : Alloca->users()) {
+    Instruction *Inst = cast<Instruction>(AllocaUser);
+    switch (Inst->getOpcode()) {
+    case Instruction::GetElementPtr: {
+      GetElementPtrInst *GEP = cast<GetElementPtrInst>(AllocaUser);
+      Value *Index = GEPToVectorIndex(GEP);
+
+      // If we can't compute a vector index from this GEP, then we can't
+      // promote this alloca to vector.
+      if (!Index) {
+        DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
+                     << '\n');
+        return false;
+      }
+
+      GEPVectorIdx[GEP] = Index;
+      for (User *GEPUser : AllocaUser->users()) {
+        if (!canVectorizeSimpleInst(cast<Instruction>(GEPUser), AllocaUser))
+          return false;
+
+        WorkList.insert(GEPUser);
+      }
+      break;
+    }
+    case Instruction::Load: {
+      if (!canVectorizeLoad(cast<LoadInst>(Inst), Alloca, WorkList,
+                            GEPVectorIdx))
+        return false;
+      break;
+    }
+    case Instruction::BitCast:
+    case Instruction::AddrSpaceCast: {
+      WorkList.insert(Inst);
+      break;
+    }
+    case Instruction::Store: {
+      if (!canVectorizeStore(cast<StoreInst>(Inst), Alloca, WorkList,
+                             GEPVectorIdx))
+        return false;
+      break;
+    }
+    default:
+      // Can't handle this instruction - reject vectorization.
+      return false;
+    }
+  }
+  return true;
+}
+
+static void promoteLoad(LoadInst *LdInst, AllocaInst *Alloca,
+                        VectorType *VectorTy, Type *VecPtrTy,
+                        const std::map<GetElementPtrInst*, Value*> &GEPVectorIdx,
+                        IRBuilder<> &Builder, Module *Mod) {
+  Value *Ptr = LdInst->getPointerOperand();
+  if (isa<GetElementPtrInst>(Ptr) &&
+      cast<GetElementPtrInst>(Ptr)->getResultElementType()->isSingleValueType()) {
+    // A single element is loaded through a GEP: extract it from the vector.
+    Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+
+    Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+    Value *VecValue = Builder.CreateLoad(BitCast);
+    Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+    LdInst->replaceAllUsesWith(ExtractElement);
+    LdInst->eraseFromParent();
+  } else {
+    // The alloca is loaded as an array value. Load it as a vector instead,
+    // and replace any extractvalues/insertvalues with
+    // extractelements/insertelements.
+    Type *AlignedVecPtrTy =
+        VectorTy->getPointerTo(LdInst->getPointerAddressSpace());
+    Value *BitCast = Builder.CreateBitCast(Ptr, AlignedVecPtrTy);
+    Value *VecValue = Builder.CreateLoad(BitCast);
+
+    std::vector<Instruction *> ToErase;
+    for (User *LoadUser : LdInst->users()) {
+      switch (cast<Instruction>(LoadUser)->getOpcode()) {
+      case Instruction::ExtractValue: {
+        ExtractValueInst *EVInst = cast<ExtractValueInst>(LoadUser);
+        IRBuilder<> EVBuilder(EVInst);
+        Value *ExtractElement = EVBuilder.CreateExtractElement(
+            VecValue, EVToVectorIndex(EVInst, Mod));
+        EVInst->replaceAllUsesWith(ExtractElement);
+        ToErase.push_back(EVInst);
+        break;
+      }
+      case Instruction::InsertValue: {
+        InsertValueInst *IVInst = cast<InsertValueInst>(LoadUser);
+        IRBuilder<> IVBuilder(IVInst);
+        Value *InsertElement = IVBuilder.CreateInsertElement(
+            VecValue, IVInst->getInsertedValueOperand(),
+            IVToVectorIndex(IVInst, Mod));
+        IVInst->replaceAllUsesWith(InsertElement);
+        ToErase.push_back(IVInst);
+        break;
+      }
+      case Instruction::Store: {
+        StoreInst *StInst = cast<StoreInst>(LoadUser);
+        IRBuilder<> StBuilder(StInst);
+        // Make sure that the address spaces match.
+        Type *VecPtrTyAddr =
+            VectorTy->getPointerTo(StInst->getPointerAddressSpace());
+        Value *StBitCast =
+            StBuilder.CreateBitCast(StInst->getOperand(1), VecPtrTyAddr);
+        StBuilder.CreateStore(VecValue, StBitCast);
+        ToErase.push_back(StInst);
+        break;
+      }
+      default:
+        llvm_unreachable("Inconsistency in instructions promotable to vector");
+      }
+    }
+    for (Instruction *Cand : ToErase)
+      Cand->eraseFromParent();
+    LdInst->eraseFromParent();
+  }
+}
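+
+// Sketch of the rewrite performed by promoteLoad for a whole-aggregate load
+// (invented value names):
+//
+//   %agg = load [2 x float], [2 x float]* %alloca
+//   %elt = extractvalue [2 x float] %agg, 0
+//
+// becomes
+//
+//   %vptr = bitcast [2 x float]* %alloca to <2 x float>*
+//   %vec  = load <2 x float>, <2 x float>* %vptr
+//   %elt  = extractelement <2 x float> %vec, i32 0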
+
+static void promoteStore(StoreInst *StInst, AllocaInst *Alloca,
+                         VectorType *VectorTy, Type *VecPtrTy,
+                         const std::map<GetElementPtrInst*, Value*> &GEPVectorIdx,
+                         IRBuilder<> &Builder, Module *Mod) {
+  Value *Ptr = StInst->getPointerOperand();
+  if (isa<GetElementPtrInst>(Ptr)) {
+    // A single element is stored through a GEP: insert it into the vector.
+    Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+    Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+    Value *VecValue = Builder.CreateLoad(BitCast);
+    Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+                                                     StInst->getValueOperand(),
+                                                     Index);
+    Builder.CreateStore(NewVecValue, BitCast);
+    StInst->eraseFromParent();
+  } else {
+    // We have already verified that a chain of one or more insertvalues
+    // leads from an undef to this store. Walk back up the chain, then
+    // traverse it forwards, replacing the array aggregate with a vector.
+    Value *Val = StInst->getValueOperand();
+    std::vector<Value *> IVWorkList;
+    while (isa<InsertValueInst>(Val)) {
+      IVWorkList.push_back(Val);
+      Val = cast<InsertValueInst>(Val)->getAggregateOperand();
+    }
+    assert(isa<UndefValue>(Val));
+
+    // Replace the insertvalues with insertelements, starting from a new
+    // undef of the vector type rather than the array (aggregate) type.
+    // Builder is positioned at the store, which every value in the chain
+    // dominates, so inserting the new instructions there is safe.
+    std::vector<Instruction *> ToErase;
+    Value *VecValue = UndefValue::get(VectorTy);
+    while (!IVWorkList.empty()) {
+      InsertValueInst *IVInst = cast<InsertValueInst>(IVWorkList.back());
+      IVWorkList.pop_back();
+
+      VecValue = Builder.CreateInsertElement(VecValue,
+                                             IVInst->getInsertedValueOperand(),
+                                             IVToVectorIndex(IVInst, Mod));
+      ToErase.push_back(IVInst);
+    }
+    Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+    Builder.CreateStore(VecValue, BitCast);
+    StInst->eraseFromParent();
+    // Erase the insertvalues in reverse order so that no value is erased
+    // while it still has a use.
+    while (!ToErase.empty()) {
+      ToErase.back()->eraseFromParent();
+      ToErase.pop_back();
+    }
+  }
+}
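+
+// Sketch of the rewrite performed by promoteStore for a whole-aggregate
+// store (invented value names):
+//
+//   %v0 = insertvalue [2 x float] undef, float %a, 0
+//   %v1 = insertvalue [2 x float] %v0, float %b, 1
+//   store [2 x float] %v1, [2 x float]* %alloca
+//
+// becomes
+//
+//   %w0 = insertelement <2 x float> undef, float %a, i32 0
+//   %w1 = insertelement <2 x float> %w0, float %b, i32 1
+//   %vptr = bitcast [2 x float]* %alloca to <2 x float>*
+//   store <2 x float> %w1, <2 x float>* %vptr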
+
+// Attempt to turn an array aggregate alloca into vector operations.
+// There are two phases in this function:
+// Phase 1 - check that the transformation is possible, and build a WorkList
+//           of instructions to transform.
+// Phase 2 - transform the array aggregate uses into vector uses.
+//
+// If the array aggregate or its uses are too complicated, return false
+// (indicating that the promotion failed).
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, Module *Mod,
+                                     AMDGPUAS AS) {
   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
 
   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -424,6 +696,7 @@
   // are just being conservative for now.
   if (!AllocaTy ||
       AllocaTy->getElementType()->isVectorTy() ||
+      AllocaTy->getElementType()->isArrayTy() ||
       AllocaTy->getNumElements() > 4 ||
       AllocaTy->getNumElements() < 2) {
     DEBUG(dbgs() << "  Cannot convert type to vector\n");
@@ -431,36 +704,16 @@
   }
 
   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
-  std::vector<Value *> WorkList;
-  for (User *AllocaUser : Alloca->users()) {
-    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
-    if (!GEP) {
-      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
-        return false;
-
-      WorkList.push_back(AllocaUser);
-      continue;
-    }
-
-    Value *Index = GEPToVectorIndex(GEP);
-
-    // If we can't compute a vector index from this GEP, then we can't
-    // promote this alloca to vector.
-    if (!Index) {
-      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
-      return false;
-    }
-
-    GEPVectorIdx[GEP] = Index;
-    for (User *GEPUser : AllocaUser->users()) {
-      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
-        return false;
-
-      WorkList.push_back(GEPUser);
-    }
-  }
+  // Use a SetVector as there is a (valid) possibility of duplicates: it
+  // keeps work items in insertion order while folding duplicates away.
+  SmallSetVector<Value *, 8> WorkList;
+  if (!canVectorizeInst(Alloca, WorkList, GEPVectorIdx))
+    // Something we don't/can't handle is present.
+    return false;
+
   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+  Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
 
   DEBUG(dbgs() << "  Converting alloca to vector "
                << *AllocaTy << " -> " << *VectorTy << '\n');
@@ -470,29 +723,11 @@
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
-      Value *Ptr = Inst->getOperand(0);
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-
-      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
-      Value *VecValue = Builder.CreateLoad(BitCast);
-      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
-      Inst->replaceAllUsesWith(ExtractElement);
-      Inst->eraseFromParent();
+      promoteLoad(cast<LoadInst>(Inst), Alloca, VectorTy, VecPtrTy,
+                  GEPVectorIdx, Builder, Mod);
       break;
    }
    case Instruction::Store: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
-
-      Value *Ptr = Inst->getOperand(1);
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
-      Value *VecValue = Builder.CreateLoad(BitCast);
-      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
-                                                       Inst->getOperand(0),
-                                                       Index);
-      Builder.CreateStore(NewVecValue, BitCast);
-      Inst->eraseFromParent();
+      promoteStore(cast<StoreInst>(Inst), Alloca, VectorTy, VecPtrTy,
+                   GEPVectorIdx, Builder, Mod);
       break;
    }
    case Instruction::BitCast:
@@ -675,8 +910,8 @@
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, AS)) {
-    DEBUG(dbgs() << "  alloca is not a candidate for vectorization.\n");
+  if (tryPromoteAllocaToVector(&I, Mod, AS)) {
+    DEBUG(dbgs() << "  alloca has been vectorized.\n");
     return;
   }
Index: test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -0,0 +1,264 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; Make sure that array allocas loaded and stored as multi-element aggregates are handled correctly.
+
+; CHECK-LABEL: @promote_1d_aggr
+
+%Block = type { [1 x float], i32 }
+%gl_PerVertex = type { <4 x float>, float, [1 x float], [1 x float] }
+
+@block = external addrspace(7) global %Block
+@0 = external addrspace(6) global %gl_PerVertex
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_1d_aggr() #0 {
+  %i = alloca i32
+  %f1 = alloca [1 x float]
+  %1 = getelementptr %Block, %Block addrspace(7)* @block, i32 0, i32 1
+  %2 = load i32, i32 addrspace(7)* %1
+  store i32 %2, i32* %i
+  %3 = getelementptr %Block, %Block addrspace(7)* @block, i32 0, i32 0
+  %4 = load [1 x float], [1 x float] addrspace(7)* %3
+  store [1 x float] %4, [1 x float]* %f1
+  %5 = load i32, i32* %i
+  %6 = getelementptr [1 x float], [1 x float]* %f1, i32 0, i32 %5
+  %7 = load float, float* %6
+  %8 = alloca <4 x float>
+  %9 = load <4 x float>, <4 x float>* %8
+  %10 = insertelement <4 x float> %9, float %7, i32 0
+  %11 = insertelement <4 x float> %10, float %7, i32 1
+  %12 = insertelement <4 x float> %11, float %7, i32 2
+  %13 = insertelement <4 x float> %12, float %7, i32 3
+  %14 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> %13, <4 x float> addrspace(6)* %14
+  ret void
+}
+
+
+; CHECK-LABEL: @promote_store_aggr
+; CHECK: store <2 x float> %14, <2 x float> addrspace(7)* %16
+
+%Block2 = type { i32, [2 x float] }
+@block2 = external addrspace(7) global %Block2
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_store_aggr() #0 {
+  %i = alloca i32
+  %f1 = alloca [2 x float]
+  %1 = getelementptr %Block2, %Block2 addrspace(7)* @block2, i32 0, i32 0
+  %2 = load i32, i32 addrspace(7)* %1
+  store i32 %2, i32* %i
+  %3 = load i32, i32* %i
+  %4 = sitofp i32 %3 to float
+  %5 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 0
+  store float %4, float* %5
+  %6 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 1
+  store float 2.000000e+00, float* %6
+  %7 = load [2 x float], [2 x float]* %f1
+  %8 = getelementptr %Block2, %Block2 addrspace(7)* @block2, i32 0, i32 1
+  store [2 x float] %7, [2 x float] addrspace(7)* %8
+  %9 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> , <4 x float> addrspace(6)* %9
+  ret void
+}
+
+; CHECK-LABEL: @promote_load_from_store_aggr
+; CHECK: %5 = load <2 x float>, <2 x float> addrspace(7)* %4
+; CHECK: %6 = bitcast [2 x float]* %f1 to <2 x float>*
+; CHECK: store <2 x float> %5, <2 x float>* %6
+
+%Block3 = type { [2 x float], i32 }
+@block3 = external addrspace(7) global %Block3
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_load_from_store_aggr() #0 {
+  %i = alloca i32
+  %f1 = alloca [2 x float]
+  %1 = getelementptr %Block3, %Block3 addrspace(7)* @block3, i32 0, i32 1
+  %2 = load i32, i32 addrspace(7)* %1
+  store i32 %2, i32* %i
+  %3 = getelementptr %Block3, %Block3 addrspace(7)* @block3, i32 0, i32 0
+  %4 = load [2 x float], [2 x float] addrspace(7)* %3
+  store [2 x float] %4, [2 x float]* %f1
+  %5 = load i32, i32* %i
+  %6 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 %5
+  %7 = load float, float* %6
+  %8 = alloca <4 x float>
+  %9 = load <4 x float>, <4 x float>* %8
+  %10 = insertelement <4 x float> %9, float %7, i32 0
+  %11 = insertelement <4 x float> %10, float %7, i32 1
+  %12 = insertelement <4 x float> %11, float %7, i32 2
+  %13 = insertelement <4 x float> %12, float %7, i32 3
+  %14 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> %13, <4 x float> addrspace(6)* %14
+  ret void
+}
+
+; CHECK-LABEL: @promote_matrix_aggr
+
+%Block4 = type { <4 x float>, [2 x [4 x <4 x float>]] }
+@block4 = external addrspace(7) global %Block4
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_matrix_aggr() #0 {
+  %f4 = alloca <4 x float>
+  %m4 = alloca [2 x [4 x <4 x float>]]
+  %1 = getelementptr %Block4, %Block4 addrspace(7)* @block4, i32 0, i32 0
+  %2 = load <4 x float>, <4 x float> addrspace(7)* %1
+  store <4 x float> %2, <4 x float>* %f4
+  %3 = load <4 x float>, <4 x float>* %f4
+  %4 = load <4 x float>, <4 x float>* %f4
+  %5 = extractelement <4 x float> , i32 0
+  %6 = extractelement <4 x float> , i32 1
+  %7 = extractelement <4 x float> , i32 2
+  %8 = extractelement <4 x float> , i32 3
+  %9 = extractelement <4 x float> , i32 0
+  %10 = extractelement <4 x float> , i32 1
+  %11 = extractelement <4 x float> , i32 2
+  %12 = extractelement <4 x float> , i32 3
+  %13 = extractelement <4 x float> %3, i32 0
+  %14 = extractelement <4 x float> %3, i32 1
+  %15 = extractelement <4 x float> %3, i32 2
+  %16 = extractelement <4 x float> %3, i32 3
+  %17 = extractelement <4 x float> %4, i32 0
+  %18 = extractelement <4 x float> %4, i32 1
+  %19 = extractelement <4 x float> %4, i32 2
+  %20 = extractelement <4 x float> %4, i32 3
+  %21 = alloca <4 x float>
+  %22 = load <4 x float>, <4 x float>* %21
+  %23 = insertelement <4 x float> %22, float %5, i32 0
+  %24 = insertelement <4 x float> %23, float %6, i32 1
+  %25 = insertelement <4 x float> %24, float %7, i32 2
+  %26 = insertelement <4 x float> %25, float %8, i32 3
+  %27 = alloca <4 x float>
+  %28 = load <4 x float>, <4 x float>* %27
+  %29 = insertelement <4 x float> %28, float %9, i32 0
+  %30 = insertelement <4 x float> %29, float %10, i32 1
+  %31 = insertelement <4 x float> %30, float %11, i32 2
+  %32 = insertelement <4 x float> %31, float %12, i32 3
+  %33 = alloca <4 x float>
+  %34 = load <4 x float>, <4 x float>* %33
+  %35 = insertelement <4 x float> %34, float %13, i32 0
+  %36 = insertelement <4 x float> %35, float %14, i32 1
+  %37 = insertelement <4 x float> %36, float %15, i32 2
+  %38 = insertelement <4 x float> %37, float %16, i32 3
+  %39 = alloca <4 x float>
+  %40 = load <4 x float>, <4 x float>* %39
+  %41 = insertelement <4 x float> %40, float %17, i32 0
+  %42 = insertelement <4 x float> %41, float %18, i32 1
+  %43 = insertelement <4 x float> %42, float %19, i32 2
+  %44 = insertelement <4 x float> %43, float %20, i32 3
+  %45 = alloca [4 x <4 x float>]
+  %46 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 0
+  store <4 x float> %26, <4 x float>* %46
+  %47 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 1
+  store <4 x float> %32, <4 x float>* %47
+  %48 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 2
+  store <4 x float> %38, <4 x float>* %48
+  %49 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 3
+  store <4 x float> %44, <4 x float>* %49
+  %50 = load [4 x <4 x float>], [4 x <4 x float>]* %45
+  %51 = getelementptr [2 x [4 x <4 x float>]], [2 x [4 x <4 x float>]]* %m4, i32 0, i32 0
+  store [4 x <4 x float>] %50, [4 x <4 x float>]* %51
+  %52 = load <4 x float>, <4 x float>* %f4
+  %53 = load <4 x float>, <4 x float>* %f4
+  %54 = extractelement <4 x float> %52, i32 0
+  %55 = extractelement <4 x float> %52, i32 1
+  %56 = extractelement <4 x float> %52, i32 2
+  %57 = extractelement <4 x float> %52, i32 3
+  %58 = extractelement <4 x float> %53, i32 0
+  %59 = extractelement <4 x float> %53, i32 1
+  %60 = extractelement <4 x float> %53, i32 2
+  %61 = extractelement <4 x float> %53, i32 3
+  %62 = extractelement <4 x float> zeroinitializer, i32 0
+  %63 = extractelement <4 x float> zeroinitializer, i32 1
+  %64 = extractelement <4 x float> zeroinitializer, i32 2
+  %65 = extractelement <4 x float> zeroinitializer, i32 3
+  %66 = extractelement <4 x float> zeroinitializer, i32 0
+  %67 = extractelement <4 x float> zeroinitializer, i32 1
+  %68 = extractelement <4 x float> zeroinitializer, i32 2
+  %69 = extractelement <4 x float> zeroinitializer, i32 3
+  %70 = alloca <4 x float>
+  %71 = load <4 x float>, <4 x float>* %70
+  %72 = insertelement <4 x float> %71, float %54, i32 0
+  %73 = insertelement <4 x float> %72, float %55, i32 1
+  %74 = insertelement <4 x float> %73, float %56, i32 2
+  %75 = insertelement <4 x float> %74, float %57, i32 3
+  %76 = alloca <4 x float>
+  %77 = load <4 x float>, <4 x float>* %76
+  %78 = insertelement <4 x float> %77, float %58, i32 0
+  %79 = insertelement <4 x float> %78, float %59, i32 1
+  %80 = insertelement <4 x float> %79, float %60, i32 2
+  %81 = insertelement <4 x float> %80, float %61, i32 3
+  %82 = alloca <4 x float>
+  %83 = load <4 x float>, <4 x float>* %82
+  %84 = insertelement <4 x float> %83, float %62, i32 0
+  %85 = insertelement <4 x float> %84, float %63, i32 1
+  %86 = insertelement <4 x float> %85, float %64, i32 2
+  %87 = insertelement <4 x float> %86, float %65, i32 3
+  %88 = alloca <4 x float>
+  %89 = load <4 x float>, <4 x float>* %88
+  %90 = insertelement <4 x float> %89, float %66, i32 0
+  %91 = insertelement <4 x float> %90, float %67, i32 1
+  %92 = insertelement <4 x float> %91, float %68, i32 2
+  %93 = insertelement <4 x float> %92, float %69, i32 3
+  %94 = alloca [4 x <4 x float>]
+  %95 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 0
+  store <4 x float> %75, <4 x float>* %95
+  %96 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 1
+  store <4 x float> %81, <4 x float>* %96
+  %97 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 2
+  store <4 x float> %87, <4 x float>* %97
+  %98 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 3
+  store <4 x float> %93, <4 x float>* %98
+  %99 = load [4 x <4 x float>], [4 x <4 x float>]* %94
+  %100 = getelementptr [2 x [4 x <4 x float>]], [2 x [4 x <4 x float>]]* %m4, i32 0, i32 1
+  store [4 x <4 x float>] %99, [4 x <4 x float>]* %100
+  %101 = load [2 x [4 x <4 x float>]], [2 x [4 x <4 x float>]]* %m4
+  %102 = getelementptr %Block4, %Block4 addrspace(7)* @block4, i32 0, i32 1
+  store [2 x [4 x <4 x float>]] %101, [2 x [4 x <4 x float>]] addrspace(7)* %102
+  %103 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> zeroinitializer, <4 x float> addrspace(6)* %103
+  ret void
+}
+
+; CHECK-LABEL: @promote_double_aggr
+; CHECK: %5 = insertelement <2 x double> undef, double %2, i32 0
+; CHECK: %6 = insertelement <2 x double> %5, double %4, i32 1
+; CHECK: %9 = bitcast [2 x double]* %s to <2 x double>*
+; CHECK: %10 = load <2 x double>, <2 x double>* %9
+; CHECK: %11 = extractelement <2 x double> %10, i32 1
+
+@1 = external addrspace(7) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
+@frag_color = external addrspace(6) global <4 x float>
+
+; Function Attrs: nounwind
+define amdgpu_ps void @promote_double_aggr() #0 {
+  %s = alloca [2 x double]
+  %1 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(7)* @1, i32 0, i32 0, i32 0
+  %2 = load double, double addrspace(7)* %1
+  %3 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(7)* @1, i32 0, i32 0, i32 1
+  %4 = load double, double addrspace(7)* %3
+  %5 = insertvalue [2 x double] undef, double %2, 0
+  %6 = insertvalue [2 x double] %5, double %4, 1
+  store [2 x double] %6, [2 x double]* %s
+  %7 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1
+  %8 = load double, double* %7
+  %9 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1
+  %10 = load double, double* %9
+  %11 = fadd double %8, %10
+  %12 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0
+  store double %11, double* %12
+  %13 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0
+  %14 = load double, double* %13
+  %15 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1
+  %16 = load double, double* %15
+  %17 = fadd double %14, %16
+  %18 = fptrunc double %17 to float
+  %19 = insertelement <4 x float> undef, float %18, i32 0
+  %20 = insertelement <4 x float> %19, float %18, i32 1
+  %21 = insertelement <4 x float> %20, float %18, i32 2
+  %22 = insertelement <4 x float> %21, float %18, i32 3
+  store <4 x float> %22, <4 x float> addrspace(6)* @frag_color
+  ret void
+}