Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -155,6 +155,7 @@
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
 
   FunctionType *FTy = F.getFunctionType();
@@ -395,11 +396,28 @@
   return GEP->getOperand(2);
 }
 
-// Not an instruction handled below to turn into a vector.
+static Value *EVToVectorIndex(ExtractValueInst *EV, Module *Mod) {
+  // FIXME: we only support simple cases.
+  if (EV->getNumIndices() != 1)
+    return nullptr;
+
+  return ConstantInt::get(Type::getInt32Ty(Mod->getContext()),
+                          EV->getIndices()[0]);
+}
+
+static Value *IVToVectorIndex(InsertValueInst *IV, Module *Mod) {
+  // FIXME: we only support simple cases.
+  if (IV->getNumIndices() != 1)
+    return nullptr;
+
+  return ConstantInt::get(Type::getInt32Ty(Mod->getContext()),
+                          IV->getIndices()[0]);
+}
+
+// Check that a GEP user can be vectorized during the transformation phase.
+// The instructions checked here operate on the element type of the original
+// alloca'd array aggregate.
 //
 // TODO: Check isTriviallyVectorizable for calls and handle other
 // instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeSimpleInst(Instruction *Inst, User *User) {
   switch (Inst->getOpcode()) {
   case Instruction::Load:
   case Instruction::BitCast:
@@ -415,7 +433,261 @@
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
+// Check that we can vectorize a load of an entire aggregate (at this stage,
+// arrays of 2, 3 or 4 elements). Checking up front lets us give up on cases
+// we cannot handle rather than breaking later, during the transform phase.
+// We can handle the simple cases where every use of the load is an
+// extractvalue, insertvalue or store; these are the patterns observed from
+// the front-ends so far.
+// TODO: Extend to more cases.
+static bool canVectorizeLoad(LoadInst *Inst, AllocaInst *Alloca,
+                             SmallSetVector<Value *, 8> &WorkList,
+                             std::map<GetElementPtrInst*, Value*> &GEPVectorIdx) {
+  // Loading the whole array: the uses have to be vectorized as well, so
+  // limit them to extractvalue, insertvalue and store.
+  for (User *LoadUser : Inst->users()) {
+    switch (cast<Instruction>(LoadUser)->getOpcode()) {
+    case Instruction::ExtractValue:
+      if (cast<ExtractValueInst>(LoadUser)->getNumIndices() != 1)
+        return false;
+      break;
+    case Instruction::InsertValue:
+      if (cast<InsertValueInst>(LoadUser)->getNumIndices() != 1)
+        return false;
+      break;
+    case Instruction::Store:
+      // We can handle these.
+      break;
+    default:
+      // More complicated - reject vectorization in this case.
+      return false;
+    }
+  }
+  // Add the load to the WorkList of items to transform. We use a SetVector
+  // because duplicates are possible when back-tracking from a store.
+  WorkList.insert(Inst);
+  return true;
+}
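+
+// As an illustrative sketch (the value names are invented and do not come
+// from the tests below), canVectorizeLoad accepts an aggregate load such as:
+//
+//   %agg = load [2 x float], [2 x float]* %alloca
+//   %elt = extractvalue [2 x float] %agg, 0
+//   store [2 x float] %agg, [2 x float] addrspace(7)* %out
+//
+// where every user of %agg is an extractvalue, insertvalue or store.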
+
+// Check that we can vectorize a store of an entire aggregate (at this stage,
+// arrays of 2, 3 or 4 elements). Checking up front lets us give up on cases
+// we cannot handle rather than breaking later, during the transform phase.
+// This handles only stores directly to the alloca. We backtrack from the
+// store to make sure that there are only insertvalues with one use each,
+// terminating in either a load or an undef. This is quite limited, but it
+// covers the cases encountered with current front-ends.
+// TODO: Extend to more cases.
+static bool canVectorizeStore(StoreInst *Inst, AllocaInst *Alloca,
+                              SmallSetVector<Value *, 8> &WorkList,
+                              std::map<GetElementPtrInst*, Value*> &GEPVectorIdx) {
+  // The alloca must be the stored pointer operand, not a stored value.
+  Value *Ptr = Inst->getPointerOperand();
+  if (Ptr != Alloca)
+    return false;
+
+  // We can handle insertvalue chains leading back to either an undef or to
+  // a load of an alloca that we convert to a vector.
+  Value *Val = Inst->getValueOperand();
+  while (isa<InsertValueInst>(Val)) {
+    // Can't deal with more than one use, or with more than one index
+    // (a multi-dimensional array).
+    if (!Val->hasOneUse() || cast<InsertValueInst>(Val)->getNumIndices() != 1)
+      return false;
+    Val = cast<InsertValueInst>(Val)->getAggregateOperand();
+  }
+  if (isa<UndefValue>(Val)) {
+    // We can handle the store; add it to the WorkList.
+    WorkList.insert(Inst);
+    return true;
+  }
+
+  if (isa<LoadInst>(Val)) {
+    // Check that the load can be vectorized. The load, rather than this
+    // store, is added to the WorkList as the starting point; transforming
+    // it will transform the store as well. We go through the load because
+    // it may have more than one use.
+    return canVectorizeLoad(cast<LoadInst>(Val), Alloca, WorkList,
+                            GEPVectorIdx);
+  }
+
+  // Not a value we handle.
+  return false;
+}
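+
+// As an illustrative sketch (invented value names), canVectorizeStore
+// accepts a chain of single-use insertvalues that starts at an undef and
+// ends at a store to the alloca:
+//
+//   %v0 = insertvalue [2 x float] undef, float %a, 0
+//   %v1 = insertvalue [2 x float] %v0, float %b, 1
+//   store [2 x float] %v1, [2 x float]* %alloca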
+
+// Build a WorkList of instructions that will need modification once the
+// alloca is vectorized. If something that prevents vectorization is
+// encountered, return false.
+static bool canVectorizeInst(AllocaInst *Alloca,
+                             SmallSetVector<Value *, 8> &WorkList,
+                             std::map<GetElementPtrInst*, Value*> &GEPVectorIdx) {
+  for (User *AllocaUser : Alloca->users()) {
+    Instruction *Inst = cast<Instruction>(AllocaUser);
+    switch (Inst->getOpcode()) {
+    case Instruction::GetElementPtr: {
+      GetElementPtrInst *GEP = cast<GetElementPtrInst>(AllocaUser);
+      Value *Index = GEPToVectorIndex(GEP);
+
+      // If we can't compute a vector index from this GEP, then we can't
+      // promote this alloca to vector.
+      if (!Index) {
+        DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
+                     << '\n');
+        return false;
+      }
+
+      GEPVectorIdx[GEP] = Index;
+      for (User *GEPUser : AllocaUser->users()) {
+        if (!canVectorizeSimpleInst(cast<Instruction>(GEPUser), AllocaUser))
+          return false;
+
+        WorkList.insert(GEPUser);
+      }
+      break;
+    }
+    case Instruction::Load: {
+      if (!canVectorizeLoad(cast<LoadInst>(Inst), Alloca, WorkList,
+                            GEPVectorIdx))
+        return false;
+      break;
+    }
+    case Instruction::BitCast:
+    case Instruction::AddrSpaceCast: {
+      WorkList.insert(Inst);
+      break;
+    }
+    case Instruction::Store: {
+      if (!canVectorizeStore(cast<StoreInst>(Inst), Alloca, WorkList,
+                             GEPVectorIdx))
+        return false;
+      break;
+    }
+    default:
+      // Can't handle this instruction - reject vectorization.
+      return false;
+    }
+  }
+  return true;
+}
+
+static void promoteLoad(LoadInst *LdInst, AllocaInst *Alloca,
+                        VectorType *VectorTy, Type *VecPtrTy,
+                        const std::map<GetElementPtrInst*, Value*> &GEPVectorIdx,
+                        IRBuilder<> &Builder, Module *Mod) {
+  Value *Ptr = LdInst->getPointerOperand();
+  if (isa<GetElementPtrInst>(Ptr) &&
+      cast<GetElementPtrInst>(Ptr)->getResultElementType()->isSingleValueType()) {
+    // A single element is loaded through a GEP: extract it from the vector.
+    Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+
+    Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+    Value *VecValue = Builder.CreateLoad(BitCast);
+    Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+    LdInst->replaceAllUsesWith(ExtractElement);
+    LdInst->eraseFromParent();
+  } else {
+    // The alloca is loaded as an array value. Load it as a vector instead,
+    // and replace any extractvalues/insertvalues with
+    // extractelements/insertelements.
+    Type *AlignedVecPtrTy =
+        VectorTy->getPointerTo(LdInst->getPointerAddressSpace());
+    Value *BitCast = Builder.CreateBitCast(Ptr, AlignedVecPtrTy);
+    Value *VecValue = Builder.CreateLoad(BitCast);
+
+    std::vector<Instruction *> ToErase;
+    for (User *LoadUser : LdInst->users()) {
+      switch (cast<Instruction>(LoadUser)->getOpcode()) {
+      case Instruction::ExtractValue: {
+        ExtractValueInst *EVInst = cast<ExtractValueInst>(LoadUser);
+        IRBuilder<> EVBuilder(EVInst);
+        Value *ExtractElement = EVBuilder.CreateExtractElement(
+            VecValue, EVToVectorIndex(EVInst, Mod));
+        EVInst->replaceAllUsesWith(ExtractElement);
+        ToErase.push_back(EVInst);
+        break;
+      }
+      case Instruction::InsertValue: {
+        InsertValueInst *IVInst = cast<InsertValueInst>(LoadUser);
+        IRBuilder<> IVBuilder(IVInst);
+        Value *InsertElement = IVBuilder.CreateInsertElement(
+            VecValue, IVInst->getInsertedValueOperand(),
+            IVToVectorIndex(IVInst, Mod));
+        IVInst->replaceAllUsesWith(InsertElement);
+        ToErase.push_back(IVInst);
+        break;
+      }
+      case Instruction::Store: {
+        StoreInst *StInst = cast<StoreInst>(LoadUser);
+        IRBuilder<> StBuilder(StInst);
+        // Make sure that the address spaces match.
+        Type *VecPtrTyAddr =
+            VectorTy->getPointerTo(StInst->getPointerAddressSpace());
+        Value *StBitCast =
+            StBuilder.CreateBitCast(StInst->getOperand(1), VecPtrTyAddr);
+        StBuilder.CreateStore(VecValue, StBitCast);
+        ToErase.push_back(StInst);
+        break;
+      }
+      default:
+        llvm_unreachable("Inconsistency in instructions promotable to vector");
+      }
+    }
+    for (Instruction *Cand : ToErase)
+      Cand->eraseFromParent();
+    LdInst->eraseFromParent();
+  }
+}
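+
+// Sketch of the rewrite performed by promoteLoad for a whole-aggregate load
+// (invented value names):
+//
+//   %agg = load [2 x float], [2 x float]* %alloca
+//   %elt = extractvalue [2 x float] %agg, 0
+//
+// becomes
+//
+//   %vptr = bitcast [2 x float]* %alloca to <2 x float>*
+//   %vec  = load <2 x float>, <2 x float>* %vptr
+//   %elt  = extractelement <2 x float> %vec, i32 0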
+
+static void promoteStore(StoreInst *StInst, AllocaInst *Alloca,
+                         VectorType *VectorTy, Type *VecPtrTy,
+                         const std::map<GetElementPtrInst*, Value*> &GEPVectorIdx,
+                         IRBuilder<> &Builder, Module *Mod) {
+  Value *Ptr = StInst->getPointerOperand();
+  if (isa<GetElementPtrInst>(Ptr)) {
+    // A single element is stored through a GEP: insert it into the vector.
+    Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+    Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+    Value *VecValue = Builder.CreateLoad(BitCast);
+    Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+                                                     StInst->getValueOperand(),
+                                                     Index);
+    Builder.CreateStore(NewVecValue, BitCast);
+    StInst->eraseFromParent();
+  } else {
+    // We have already verified that a chain of one or more insertvalues
+    // leads from an undef to this store. Walk back up the chain, then
+    // traverse it forwards, replacing the array aggregate with a vector.
+    Value *Val = StInst->getValueOperand();
+    std::vector<Value *> IVWorkList;
+    while (isa<InsertValueInst>(Val)) {
+      IVWorkList.push_back(Val);
+      Val = cast<InsertValueInst>(Val)->getAggregateOperand();
+    }
+    assert(isa<UndefValue>(Val));
+
+    // Replace the insertvalues with insertelements, starting from a new
+    // undef of the vector type rather than the array (aggregate) type.
+    // Builder is positioned at the store, which every value in the chain
+    // dominates, so inserting the new instructions there is safe.
+    std::vector<Instruction *> ToErase;
+    Value *VecValue = UndefValue::get(VectorTy);
+    while (!IVWorkList.empty()) {
+      InsertValueInst *IVInst = cast<InsertValueInst>(IVWorkList.back());
+      IVWorkList.pop_back();
+
+      VecValue = Builder.CreateInsertElement(VecValue,
+                                             IVInst->getInsertedValueOperand(),
+                                             IVToVectorIndex(IVInst, Mod));
+      ToErase.push_back(IVInst);
+    }
+    Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+    Builder.CreateStore(VecValue, BitCast);
+    StInst->eraseFromParent();
+    // Erase the insertvalues in reverse order so that no value is erased
+    // while it still has a use.
+    while (!ToErase.empty()) {
+      ToErase.back()->eraseFromParent();
+      ToErase.pop_back();
+    }
+  }
+}
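+
+// Sketch of the rewrite performed by promoteStore for a whole-aggregate
+// store (invented value names):
+//
+//   %v0 = insertvalue [2 x float] undef, float %a, 0
+//   %v1 = insertvalue [2 x float] %v0, float %b, 1
+//   store [2 x float] %v1, [2 x float]* %alloca
+//
+// becomes
+//
+//   %w0 = insertelement <2 x float> undef, float %a, i32 0
+//   %w1 = insertelement <2 x float> %w0, float %b, i32 1
+//   %vptr = bitcast [2 x float]* %alloca to <2 x float>*
+//   store <2 x float> %w1, <2 x float>* %vptr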
+
+// Attempt to turn an array aggregate alloca into vector operations.
+// There are two phases in this function:
+// Phase 1 - check that the transformation is possible, and build a WorkList
+//           of instructions to transform.
+// Phase 2 - transform the array aggregate uses into vector uses.
+//
+// If the array aggregate or its uses are too complicated, return false
+// (indicating that the promotion failed).
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, Module *Mod,
+                                     AMDGPUAS AS) {
   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
 
   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -424,6 +696,7 @@
   // are just being conservative for now.
   if (!AllocaTy ||
       AllocaTy->getElementType()->isVectorTy() ||
+      AllocaTy->getElementType()->isArrayTy() ||
       AllocaTy->getNumElements() > 4 ||
       AllocaTy->getNumElements() < 2) {
     DEBUG(dbgs() << "  Cannot convert type to vector\n");
@@ -431,36 +704,16 @@
   }
 
   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
-  std::vector<Value *> WorkList;
-  for (User *AllocaUser : Alloca->users()) {
-    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
-    if (!GEP) {
-      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
-        return false;
-
-      WorkList.push_back(AllocaUser);
-      continue;
-    }
-
-    Value *Index = GEPToVectorIndex(GEP);
-
-    // If we can't compute a vector index from this GEP, then we can't
-    // promote this alloca to vector.
-    if (!Index) {
-      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
-      return false;
-    }
-
-    GEPVectorIdx[GEP] = Index;
-    for (User *GEPUser : AllocaUser->users()) {
-      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
-        return false;
-
-      WorkList.push_back(GEPUser);
-    }
-  }
+  // Use a SetVector as there is a (valid) possibility of duplicates: it
+  // keeps work items in insertion order while folding duplicates away.
+  SmallSetVector<Value *, 8> WorkList;
+  if (!canVectorizeInst(Alloca, WorkList, GEPVectorIdx))
+    // Something we don't/can't handle is present.
+    return false;
+
   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+  Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
 
   DEBUG(dbgs() << "  Converting alloca to vector "
                << *AllocaTy << " -> " << *VectorTy << '\n');
@@ -470,29 +723,11 @@
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
-      Value *Ptr = Inst->getOperand(0);
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-
-      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
-      Value *VecValue = Builder.CreateLoad(BitCast);
-      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
-      Inst->replaceAllUsesWith(ExtractElement);
-      Inst->eraseFromParent();
+      promoteLoad(cast<LoadInst>(Inst), Alloca, VectorTy, VecPtrTy,
+                  GEPVectorIdx, Builder, Mod);
       break;
    }
    case Instruction::Store: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
-
-      Value *Ptr = Inst->getOperand(1);
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
-      Value *VecValue = Builder.CreateLoad(BitCast);
-      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
-                                                       Inst->getOperand(0),
-                                                       Index);
-      Builder.CreateStore(NewVecValue, BitCast);
-      Inst->eraseFromParent();
+      promoteStore(cast<StoreInst>(Inst), Alloca, VectorTy, VecPtrTy,
+                   GEPVectorIdx, Builder, Mod);
       break;
    }
    case Instruction::BitCast:
@@ -675,8 +910,8 @@
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, AS)) {
-    DEBUG(dbgs() << "  alloca is not a candidate for vectorization.\n");
+  if (tryPromoteAllocaToVector(&I, Mod, AS)) {
+    DEBUG(dbgs() << "  alloca has been vectorized.\n");
     return;
   }
Index: test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -0,0 +1,264 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; Make sure that array allocas loaded and stored as multi-element aggregates are handled correctly.
+
+; CHECK-LABEL: @promote_1d_aggr
+
+%Block = type { [1 x float], i32 }
+%gl_PerVertex = type { <4 x float>, float, [1 x float], [1 x float] }
+
+@block = external addrspace(7) global %Block
+@0 = external addrspace(6) global %gl_PerVertex
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_1d_aggr() #0 {
+  %i = alloca i32
+  %f1 = alloca [1 x float]
+  %1 = getelementptr %Block, %Block addrspace(7)* @block, i32 0, i32 1
+  %2 = load i32, i32 addrspace(7)* %1
+  store i32 %2, i32* %i
+  %3 = getelementptr %Block, %Block addrspace(7)* @block, i32 0, i32 0
+  %4 = load [1 x float], [1 x float] addrspace(7)* %3
+  store [1 x float] %4, [1 x float]* %f1
+  %5 = load i32, i32* %i
+  %6 = getelementptr [1 x float], [1 x float]* %f1, i32 0, i32 %5
+  %7 = load float, float* %6
+  %8 = alloca <4 x float>
+  %9 = load <4 x float>, <4 x float>* %8
+  %10 = insertelement <4 x float> %9, float %7, i32 0
+  %11 = insertelement <4 x float> %10, float %7, i32 1
+  %12 = insertelement <4 x float> %11, float %7, i32 2
+  %13 = insertelement <4 x float> %12, float %7, i32 3
+  %14 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> %13, <4 x float> addrspace(6)* %14
+  ret void
+}
+
+
+; CHECK-LABEL: @promote_store_aggr
+; CHECK: store <2 x float> %14, <2 x float> addrspace(7)* %16
+
+%Block2 = type { i32, [2 x float] }
+@block2 = external addrspace(7) global %Block2
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_store_aggr() #0 {
+  %i = alloca i32
+  %f1 = alloca [2 x float]
+  %1 = getelementptr %Block2, %Block2 addrspace(7)* @block2, i32 0, i32 0
+  %2 = load i32, i32 addrspace(7)* %1
+  store i32 %2, i32* %i
+  %3 = load i32, i32* %i
+  %4 = sitofp i32 %3 to float
+  %5 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 0
+  store float %4, float* %5
+  %6 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 1
+  store float 2.000000e+00, float* %6
+  %7 = load [2 x float], [2 x float]* %f1
+  %8 = getelementptr %Block2, %Block2 addrspace(7)* @block2, i32 0, i32 1
+  store [2 x float] %7, [2 x float] addrspace(7)* %8
+  %9 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> , <4 x float> addrspace(6)* %9
+  ret void
+}
+
+; CHECK-LABEL: @promote_load_from_store_aggr
+; CHECK: %5 = load <2 x float>, <2 x float> addrspace(7)* %4
+; CHECK: %6 = bitcast [2 x float]* %f1 to <2 x float>*
+; CHECK: store <2 x float> %5, <2 x float>* %6
+
+%Block3 = type { [2 x float], i32 }
+@block3 = external addrspace(7) global %Block3
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_load_from_store_aggr() #0 {
+  %i = alloca i32
+  %f1 = alloca [2 x float]
+  %1 = getelementptr %Block3, %Block3 addrspace(7)* @block3, i32 0, i32 1
+  %2 = load i32, i32 addrspace(7)* %1
+  store i32 %2, i32* %i
+  %3 = getelementptr %Block3, %Block3 addrspace(7)* @block3, i32 0, i32 0
+  %4 = load [2 x float], [2 x float] addrspace(7)* %3
+  store [2 x float] %4, [2 x float]* %f1
+  %5 = load i32, i32* %i
+  %6 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 %5
+  %7 = load float, float* %6
+  %8 = alloca <4 x float>
+  %9 = load <4 x float>, <4 x float>* %8
+  %10 = insertelement <4 x float> %9, float %7, i32 0
+  %11 = insertelement <4 x float> %10, float %7, i32 1
+  %12 = insertelement <4 x float> %11, float %7, i32 2
+  %13 = insertelement <4 x float> %12, float %7, i32 3
+  %14 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> %13, <4 x float> addrspace(6)* %14
+  ret void
+}
+
+; CHECK-LABEL: @promote_matrix_aggr
+
+%Block4 = type { <4 x float>, [2 x [4 x <4 x float>]] }
+@block4 = external addrspace(7) global %Block4
+
+; Function Attrs: nounwind
+define amdgpu_vs void @promote_matrix_aggr() #0 {
+  %f4 = alloca <4 x float>
+  %m4 = alloca [2 x [4 x <4 x float>]]
+  %1 = getelementptr %Block4, %Block4 addrspace(7)* @block4, i32 0, i32 0
+  %2 = load <4 x float>, <4 x float> addrspace(7)* %1
+  store <4 x float> %2, <4 x float>* %f4
+  %3 = load <4 x float>, <4 x float>* %f4
+  %4 = load <4 x float>, <4 x float>* %f4
+  %5 = extractelement <4 x float> , i32 0
+  %6 = extractelement <4 x float> , i32 1
+  %7 = extractelement <4 x float> , i32 2
+  %8 = extractelement <4 x float> , i32 3
+  %9 = extractelement <4 x float> , i32 0
+  %10 = extractelement <4 x float> , i32 1
+  %11 = extractelement <4 x float> , i32 2
+  %12 = extractelement <4 x float> , i32 3
+  %13 = extractelement <4 x float> %3, i32 0
+  %14 = extractelement <4 x float> %3, i32 1
+  %15 = extractelement <4 x float> %3, i32 2
+  %16 = extractelement <4 x float> %3, i32 3
+  %17 = extractelement <4 x float> %4, i32 0
+  %18 = extractelement <4 x float> %4, i32 1
+  %19 = extractelement <4 x float> %4, i32 2
+  %20 = extractelement <4 x float> %4, i32 3
+  %21 = alloca <4 x float>
+  %22 = load <4 x float>, <4 x float>* %21
+  %23 = insertelement <4 x float> %22, float %5, i32 0
+  %24 = insertelement <4 x float> %23, float %6, i32 1
+  %25 = insertelement <4 x float> %24, float %7, i32 2
+  %26 = insertelement <4 x float> %25, float %8, i32 3
+  %27 = alloca <4 x float>
+  %28 = load <4 x float>, <4 x float>* %27
+  %29 = insertelement <4 x float> %28, float %9, i32 0
+  %30 = insertelement <4 x float> %29, float %10, i32 1
+  %31 = insertelement <4 x float> %30, float %11, i32 2
+  %32 = insertelement <4 x float> %31, float %12, i32 3
+  %33 = alloca <4 x float>
+  %34 = load <4 x float>, <4 x float>* %33
+  %35 = insertelement <4 x float> %34, float %13, i32 0
+  %36 = insertelement <4 x float> %35, float %14, i32 1
+  %37 = insertelement <4 x float> %36, float %15, i32 2
+  %38 = insertelement <4 x float> %37, float %16, i32 3
+  %39 = alloca <4 x float>
+  %40 = load <4 x float>, <4 x float>* %39
+  %41 = insertelement <4 x float> %40, float %17, i32 0
+  %42 = insertelement <4 x float> %41, float %18, i32 1
+  %43 = insertelement <4 x float> %42, float %19, i32 2
+  %44 = insertelement <4 x float> %43, float %20, i32 3
+  %45 = alloca [4 x <4 x float>]
+  %46 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 0
+  store <4 x float> %26, <4 x float>* %46
+  %47 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 1
+  store <4 x float> %32, <4 x float>* %47
+  %48 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 2
+  store <4 x float> %38, <4 x float>* %48
+  %49 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %45, i32 0, i32 3
+  store <4 x float> %44, <4 x float>* %49
+  %50 = load [4 x <4 x float>], [4 x <4 x float>]* %45
+  %51 = getelementptr [2 x [4 x <4 x float>]], [2 x [4 x <4 x float>]]* %m4, i32 0, i32 0
+  store [4 x <4 x float>] %50, [4 x <4 x float>]* %51
+  %52 = load <4 x float>, <4 x float>* %f4
+  %53 = load <4 x float>, <4 x float>* %f4
+  %54 = extractelement <4 x float> %52, i32 0
+  %55 = extractelement <4 x float> %52, i32 1
+  %56 = extractelement <4 x float> %52, i32 2
+  %57 = extractelement <4 x float> %52, i32 3
+  %58 = extractelement <4 x float> %53, i32 0
+  %59 = extractelement <4 x float> %53, i32 1
+  %60 = extractelement <4 x float> %53, i32 2
+  %61 = extractelement <4 x float> %53, i32 3
+  %62 = extractelement <4 x float> zeroinitializer, i32 0
+  %63 = extractelement <4 x float> zeroinitializer, i32 1
+  %64 = extractelement <4 x float> zeroinitializer, i32 2
+  %65 = extractelement <4 x float> zeroinitializer, i32 3
+  %66 = extractelement <4 x float> zeroinitializer, i32 0
+  %67 = extractelement <4 x float> zeroinitializer, i32 1
+  %68 = extractelement <4 x float> zeroinitializer, i32 2
+  %69 = extractelement <4 x float> zeroinitializer, i32 3
+  %70 = alloca <4 x float>
+  %71 = load <4 x float>, <4 x float>* %70
+  %72 = insertelement <4 x float> %71, float %54, i32 0
+  %73 = insertelement <4 x float> %72, float %55, i32 1
+  %74 = insertelement <4 x float> %73, float %56, i32 2
+  %75 = insertelement <4 x float> %74, float %57, i32 3
+  %76 = alloca <4 x float>
+  %77 = load <4 x float>, <4 x float>* %76
+  %78 = insertelement <4 x float> %77, float %58, i32 0
+  %79 = insertelement <4 x float> %78, float %59, i32 1
+  %80 = insertelement <4 x float> %79, float %60, i32 2
+  %81 = insertelement <4 x float> %80, float %61, i32 3
+  %82 = alloca <4 x float>
+  %83 = load <4 x float>, <4 x float>* %82
+  %84 = insertelement <4 x float> %83, float %62, i32 0
+  %85 = insertelement <4 x float> %84, float %63, i32 1
+  %86 = insertelement <4 x float> %85, float %64, i32 2
+  %87 = insertelement <4 x float> %86, float %65, i32 3
+  %88 = alloca <4 x float>
+  %89 = load <4 x float>, <4 x float>* %88
+  %90 = insertelement <4 x float> %89, float %66, i32 0
+  %91 = insertelement <4 x float> %90, float %67, i32 1
+  %92 = insertelement <4 x float> %91, float %68, i32 2
+  %93 = insertelement <4 x float> %92, float %69, i32 3
+  %94 = alloca [4 x <4 x float>]
+  %95 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 0
+  store <4 x float> %75, <4 x float>* %95
+  %96 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 1
+  store <4 x float> %81, <4 x float>* %96
+  %97 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 2
+  store <4 x float> %87, <4 x float>* %97
+  %98 = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* %94, i32 0, i32 3
+  store <4 x float> %93, <4 x float>* %98
+  %99 = load [4 x <4 x float>], [4 x <4 x float>]* %94
+  %100 = getelementptr [2 x [4 x <4 x float>]], [2 x [4 x <4 x float>]]* %m4, i32 0, i32 1
+  store [4 x <4 x float>] %99, [4 x <4 x float>]* %100
+  %101 = load [2 x [4 x <4 x float>]], [2 x [4 x <4 x float>]]* %m4
+  %102 = getelementptr %Block4, %Block4 addrspace(7)* @block4, i32 0, i32 1
+  store [2 x [4 x <4 x float>]] %101, [2 x [4 x <4 x float>]] addrspace(7)* %102
+  %103 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(6)* @0, i32 0, i32 0
+  store <4 x float> zeroinitializer, <4 x float> addrspace(6)* %103
+  ret void
+}
+
+; CHECK-LABEL: @promote_double_aggr
+; CHECK: %5 = insertelement <2 x double> undef, double %2, i32 0
+; CHECK: %6 = insertelement <2 x double> %5, double %4, i32 1
+; CHECK: %9 = bitcast [2 x double]* %s to <2 x double>*
+; CHECK: %10 = load <2 x double>, <2 x double>* %9
+; CHECK: %11 = extractelement <2 x double> %10, i32 1
+
+@1 = external addrspace(7) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
+@frag_color = external addrspace(6) global <4 x float>
+
+; Function Attrs: nounwind
+define amdgpu_ps void @promote_double_aggr() #0 {
+  %s = alloca [2 x double]
+  %1 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(7)* @1, i32 0, i32 0, i32 0
+  %2 = load double, double addrspace(7)* %1
+  %3 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(7)* @1, i32 0, i32 0, i32 1
+  %4 = load double, double addrspace(7)* %3
+  %5 = insertvalue [2 x double] undef, double %2, 0
+  %6 = insertvalue [2 x double] %5, double %4, 1
+  store [2 x double] %6, [2 x double]* %s
+  %7 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1
+  %8 = load double, double* %7
+  %9 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1
+  %10 = load double, double* %9
+  %11 = fadd double %8, %10
+  %12 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0
+  store double %11, double* %12
+  %13 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0
+  %14 = load double, double* %13
+  %15 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1
+  %16 = load double, double* %15
+  %17 = fadd double %14, %16
+  %18 = fptrunc double %17 to float
+  %19 = insertelement <4 x float> undef, float %18, i32 0
+  %20 = insertelement <4 x float> %19, float %18, i32 1
+  %21 = insertelement <4 x float> %20, float %18, i32 2
+  %22 = insertelement <4 x float> %21, float %18, i32 3
+  store <4 x float> %22, <4 x float> addrspace(6)* @frag_color
+  ret void
+}