diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -28,16 +28,21 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 
 #define DEBUG_TYPE "amdgpu-promote-alloca"
 
@@ -64,6 +69,7 @@ class AMDGPUPromoteAllocaImpl {
 private:
   const TargetMachine &TM;
+  const DominatorTree &DT;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
 
@@ -99,7 +105,8 @@
   bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
 
 public:
-  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+  AMDGPUPromoteAllocaImpl(TargetMachine &TM, const DominatorTree &DT)
+      : TM(TM), DT(DT) {
     const Triple &TT = TM.getTargetTriple();
     IsAMDGCN = TT.getArch() == Triple::amdgcn;
     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -113,13 +120,17 @@
 public:
   static char ID;
 
-  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+  AMDGPUPromoteAlloca() : FunctionPass(ID) {
+    initializeAMDGPUPromoteAllocaPass(*PassRegistry::getPassRegistry());
+  }
 
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-      return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+      return AMDGPUPromoteAllocaImpl(
+                 TPC->getTM<TargetMachine>(),
+                 getAnalysis<DominatorTreeWrapperPass>().getDomTree())
           .run(F, /*PromoteToLDS*/ true);
     return false;
   }
@@ -127,6 +138,8 @@
   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
     AU.setPreservesCFG();
     FunctionPass::getAnalysisUsage(AU);
   }
@@ -136,13 +149,17 @@
 public:
   static char ID;
 
-  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {
+    initializeAMDGPUPromoteAllocaToVectorPass(*PassRegistry::getPassRegistry());
+  }
 
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-      return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+      return AMDGPUPromoteAllocaImpl(
+                 TPC->getTM<TargetMachine>(),
+                 getAnalysis<DominatorTreeWrapperPass>().getDomTree())
          .run(F, /*PromoteToLDS*/ false);
     return false;
   }
@@ -152,6 +169,8 @@
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
     AU.setPreservesCFG();
     FunctionPass::getAnalysisUsage(AU);
   }
@@ -183,18 +202,24 @@
 // Move LDS uses from functions to kernels before promote alloca for accurate
 // estimation of LDS available
 INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                     "AMDGPU promote alloca to vector or LDS", false, false)
 
-INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
-                "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                      "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                    "AMDGPU promote alloca to vector", false, false)
 
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
 char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
-  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+  bool Changed =
+      AMDGPUPromoteAllocaImpl(TM, AM.getResult<DominatorTreeAnalysis>(F))
+          .run(F, /*PromoteToLDS*/ true);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -205,7 +230,9 @@
 PreservedAnalyses
 AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
-  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
+  bool Changed =
+      AMDGPUPromoteAllocaImpl(TM, AM.getResult<DominatorTreeAnalysis>(F))
+          .run(F, /*PromoteToLDS*/ false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -253,6 +280,10 @@
     Changed = true;
   }
 
+  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
+  // dangling pointers. If we want to reuse it past this point, the loop above
+  // would need to be updated to remove successfully promoted allocas.
+
   return Changed;
 }
 
@@ -269,6 +300,10 @@
   using namespace PatternMatch;
   // For now we only care about non-volatile memsets that affect the whole type
   // (start at index 0 and fill the whole alloca).
+  //
+  // TODO: Now that PromoteAlloca uses SSAUpdater, we could handle any memsets
+  // (except maybe volatile ones?) - we just need to use shufflevector if it
+  // only affects a subset of the vector.
   const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
   return I->getOperand(0) == AI &&
          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
@@ -365,6 +400,7 @@
   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
+  SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
   SmallVector<Use *, 8> Uses;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
@@ -393,12 +429,18 @@
         return RejectUser(Inst, "pointer is being stored");
 
     Type *AccessTy = getLoadStoreType(Inst);
+    if (AccessTy->isAggregateType())
+      return RejectUser(Inst, "unsupported load/store as aggregate");
+    assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
+
     Ptr = Ptr->stripPointerCasts();
 
-    // Alloca already accessed as vector, leave alone.
+    // Alloca already accessed as vector.
     if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
-                              DL->getTypeStoreSize(AccessTy))
+                              DL->getTypeStoreSize(AccessTy)) {
+      WorkList.push_back(Inst);
       continue;
+    }
 
     // Check that this is a simple access of a vector element.
     bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
@@ -416,6 +458,7 @@
       // Look through bitcasts.
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
+      UsersToRemove.push_back(Inst);
       continue;
     }
 
@@ -429,6 +472,7 @@
       GEPVectorIdx[GEP] = Index;
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
+      UsersToRemove.push_back(Inst);
      continue;
     }
 
@@ -481,13 +525,17 @@
     }
 
     // Ignore assume-like intrinsics and comparisons used in assumes.
-    if (isAssumeLikeIntrinsic(Inst))
+    if (isAssumeLikeIntrinsic(Inst)) {
+      UsersToRemove.push_back(Inst);
       continue;
+    }
 
     if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
           return isAssumeLikeIntrinsic(cast<Instruction>(U));
-        }))
+        })) {
+      UsersToRemove.push_back(Inst);
       continue;
+    }
 
     return RejectUser(Inst, "unhandled alloca user");
   }
@@ -506,42 +554,80 @@
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
+  // Sort the worklist by dominance so we can use SSAUpdater.
+  sort(WorkList,
+       [&](Instruction *A, Instruction *B) { return DT.dominates(A, B); });
+
+  SSAUpdater Updater;
+  Updater.Initialize(VectorTy, "promotealloca");
+
+  // alloca is uninitialized memory. Imitate that by making the first value a
+  // poison value.
+  Updater.AddAvailableValue(Alloca.getParent(), PoisonValue::get(VectorTy));
+
+  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
+
   for (Instruction *Inst : WorkList) {
-    IRBuilder<> Builder(Inst);
+    // Note: we use InstSimplifyFolder because it can leverage the DataLayout
+    // to do more folding, especially in the case of vector splats.
+    IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
+                                          InstSimplifyFolder(*DL));
+    Builder.SetInsertPoint(Inst);
+
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
-      Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
-      Value *VecValue =
-          Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
-      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
+      Value *Index = calculateVectorIndex(
+          cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
+
+      // loading the full vector
+      if (DL->getTypeStoreSize(Inst->getType()) == VecStoreSize) {
+        assert(cast<ConstantInt>(Index)->isZeroValue());
+
+        Value *NewVal = Builder.CreateBitCast(Vec, Inst->getType());
+        Inst->replaceAllUsesWith(NewVal);
+        Inst->eraseFromParent();
+        break;
+      }
+
+      Value *ExtractElement = Builder.CreateExtractElement(Vec, Index);
       if (Inst->getType() != VecEltTy)
         ExtractElement =
             Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
+
       Inst->replaceAllUsesWith(ExtractElement);
       Inst->eraseFromParent();
       break;
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      Value *Ptr = SI->getPointerOperand();
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
-      Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
-      Value *VecValue =
-          Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
+      Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
+      Value *Index =
+          calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
       Value *Elt = SI->getValueOperand();
+
+      // storing the full vector
+      if (DL->getTypeStoreSize(Elt->getType()) == VecStoreSize) {
+        assert(cast<ConstantInt>(Index)->isZeroValue());
+        Updater.AddAvailableValue(Inst->getParent(),
+                                  Builder.CreateBitCast(Elt, VectorTy));
+        Inst->eraseFromParent();
+        break;
+      }
+
       if (Elt->getType() != VecEltTy)
         Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
-      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
-      Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
+      Value *NewVec = Builder.CreateInsertElement(Vec, Elt, Index);
+
+      Updater.AddAvailableValue(Inst->getParent(), NewVec);
       Inst->eraseFromParent();
       break;
     }
     case Instruction::Call: {
       if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
+        // TODO: This is probably incorrect: promote-alloca-array-aggregate
+        // has an extra shuffle if the worklist is sorted by dominance, but it
+        // doesn't have it if it's not sorted.
+
         ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
         unsigned NumCopied = Length->getZExtValue() / ElementSize;
         MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
@@ -556,19 +642,25 @@
             Mask.push_back(Idx);
           }
         }
-        Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
-        Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
-        Value *VecValue =
-            Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
-        Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
-        Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
+        Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
+        Value *NewVec = Builder.CreateShuffleVector(Vec, Mask);
+
+        Updater.AddAvailableValue(Inst->getParent(), NewVec);
         Inst->eraseFromParent();
       } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
-        // Ensure the length parameter of the memsets matches the new vector
-        // type's. In general, the type size shouldn't change so this is a
-        // no-op, but it's better to be safe.
-        MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy)));
+        Value *Elt = MSI->getOperand(1);
+        if (DL->getTypeStoreSize(VecEltTy) > 1) {
+          Value *EltBytes =
+              Builder.CreateVectorSplat(DL->getTypeStoreSize(VecEltTy), Elt);
+          Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
+        }
+
+        Value *Splat =
+            Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+
+        Updater.AddAvailableValue(Inst->getParent(), Splat);
+        MSI->eraseFromParent();
       } else {
         llvm_unreachable("Unsupported call when promoting alloca to vector");
       }
@@ -580,6 +672,17 @@
     }
   }
 
+  // Delete all the users that are known to be removable.
+  for (Instruction *I : reverse(UsersToRemove)) {
+    I->dropDroppableUses();
+    assert(I->use_empty());
+    I->eraseFromParent();
+  }
+
+  // Alloca should now be dead too.
+  assert(Alloca.use_empty());
+  Alloca.eraseFromParent();
+
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -13,38 +13,21 @@
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
-; GCN-NEXT:    s_add_i32 s33, s32, 0xfc0
-; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    s_addk_i32 s32, 0x3000
+; GCN-NEXT:    s_addk_i32 s32, 0x800
 ; GCN-NEXT:    v_writelane_b32 v43, s16, 0
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v42, s30, 0
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:92
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:84
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:76
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:68
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v42, s31, 1
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
@@ -58,10 +41,10 @@
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    v_readlane_b32 s4, v43, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    s_addk_i32 s32, 0xd000
+; GCN-NEXT:    s_addk_i32 s32, 0xf800
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -192,8 +192,8 @@
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Infer address spaces
 ; GCN-O1-NEXT:      Expand Atomic instructions
-; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      Dominator Tree Construction
+; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      SROA
 ; GCN-O1-NEXT:      Cycle Info Analysis
 ; GCN-O1-NEXT:      Uniformity Analysis
@@ -467,8 +467,8 @@
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Infer address spaces
 ; GCN-O1-OPTS-NEXT:      Expand Atomic instructions
-; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      SROA
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
 ; GCN-O1-OPTS-NEXT:      Scalar Evolution Analysis
@@ -765,8 +765,8 @@
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Infer address spaces
 ; GCN-O2-NEXT:      Expand Atomic instructions
-; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      Dominator Tree Construction
+; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      SROA
 ; GCN-O2-NEXT:      Natural Loop Information
 ; GCN-O2-NEXT:      Scalar Evolution Analysis
@@ -1074,8 +1074,8 @@
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Infer address spaces
 ; GCN-O3-NEXT:      Expand Atomic instructions
-; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      Dominator Tree Construction
+; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      SROA
 ; GCN-O3-NEXT:      Natural Loop Information
 ; GCN-O3-NEXT:      Scalar Evolution Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s
 
 ; Make sure that array alloca loaded and stored as multi-element aggregates are handled correctly
 ; Strictly the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but
@@ -16,19 +16,16 @@
 define amdgpu_vs void @promote_1d_aggr() #0 {
 ; CHECK-LABEL: @promote_1d_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
-; CHECK-NEXT:    store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
+; CHECK-NEXT:    [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
 ; CHECK-NEXT:    [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
-; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
 ; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
 ; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
 ; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
@@ -60,22 +57,12 @@
 define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-LABEL: @promote_store_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
-; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
-; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
+; CHECK-NEXT:    [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
+; CHECK-NEXT:    [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
 ; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
-; CHECK-NEXT:    store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
+; CHECK-NEXT:    store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
 ; CHECK-NEXT:    store <4 x float> , ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -100,23 +87,18 @@
 define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-LABEL: @promote_load_from_store_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
-; CHECK-NEXT:    store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
-; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
-; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
-; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
+; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[FOO3_FCA_0_EXTRACT]], i32 0
+; CHECK-NEXT:    [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
 ; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -142,22 +124,7 @@
 define amdgpu_vs void @promote_memmove_aggr() #0 {
 ; CHECK-LABEL: @promote_memmove_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
 %f1 = alloca [5 x float], addrspace(5)
@@ -174,24 +141,12 @@
 define amdgpu_vs void @promote_memcpy_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> , float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
 %f1 = alloca [5 x float], addrspace(5)
@@ -213,22 +168,7 @@
 define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_identity_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
 %f1 = alloca [5 x float], addrspace(5)
@@ -248,8 +188,26 @@
 ; CHECK-LABEL: @promote_memcpy_two_aggrs(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -283,7 +241,16 @@
 define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
 ; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -305,21 +272,12 @@
 define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_inline_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32>
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    store float [[TMP6]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
 %f1 = alloca [5 x float], addrspace(5)
@@ -347,30 +305,16 @@
 define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-LABEL: @promote_double_aggr(
-; CHECK-NEXT:    [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
 ; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
 ; CHECK-NEXT:    [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
 ; CHECK-NEXT:    [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
-; CHECK-NEXT:    store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
-; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
-; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
-; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT:    [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
-; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
+; CHECK-NEXT:    [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
+; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
+; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
 ; CHECK-NEXT:    [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT:    [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT:    [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
@@ -410,21 +354,6 @@
 define amdgpu_kernel void @alloca_struct() #0 {
 ; CHECK-LABEL: @alloca_struct(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -6,7 +6,7 @@
 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
-; IR: alloca [10 x i32]
+; IR-NOT: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
 ; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16
 ; ASM-NOT: .amdgpu_lds
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -1,19 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
 
 ; Checks that memsets don't block PromoteAlloca.
-; Note: memsets are just updated with the new type size. They are not eliminated which means
-; the original alloca also stay. This puts a bit more load on SROA.
-; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
-; e.g. ConstantAggregate.
-
 define amdgpu_kernel void @memset_all_zero(i64 %val) {
 ; CHECK-LABEL: @memset_all_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -30,8 +24,7 @@
 ; CHECK-LABEL: @memset_all_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> , i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -47,11 +40,9 @@
 define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_volatile_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -64,11 +55,9 @@
 define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_badsize_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -81,8 +70,10 @@
 define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_offset_ptr_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@@ -4,15 +4,10 @@
 define i64 @test_pointer_array(i64 %v) {
 ; OPT-LABEL: @test_pointer_array(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
-; OPT-NEXT:    [[TMP0:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[V:%.*]] to ptr
-; OPT-NEXT:    [[TMP2:%.*]] = insertelement <3 x ptr> [[TMP0]], ptr [[TMP1]], i32 0
-; OPT-NEXT:    store <3 x ptr> [[TMP2]], ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP3:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP4:%.*]] = extractelement <3 x ptr> [[TMP3]], i32 0
-; OPT-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
-; OPT-NEXT:    ret i64 [[TMP5]]
+; OPT-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT:    [[TMP1:%.*]] = insertelement <3 x ptr> poison, ptr [[TMP0]], i32 0
+; OPT-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; OPT-NEXT:    ret i64 [[TMP2]]
 ;
 entry:
 %a = alloca [3 x ptr], align 16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
@@ -11,11 +11,8 @@
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
 ; GCN: store_dword v{{.+}}, [[RES]]
 
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x float> , ptr addrspace(5) %alloca, align 4
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x float> %0, i32 %sel2
-; OPT: store float %1, ptr addrspace(1) %out, align 4
+; OPT: %0 = extractelement <4 x float> , i32 %sel2
+; OPT: store float %0, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -46,12 +43,8 @@
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN: store_dwordx4 v{{.+}},
 
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
-; OPT: store <4 x float> %load, ptr addrspace(1) %out, align 4
+; OPT: %0 = insertelement <4 x float> poison, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -77,11 +70,8 @@
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x half> , ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
-; OPT: store half %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x half> , i32 %sel2
+; OPT: store half %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -105,12 +95,8 @@
 ; GCN-NOT: buffer_
 ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x half> poison, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -136,11 +122,8 @@
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x i16> , ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
-; OPT: store i16 %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x i16> , i32 %sel2
+; OPT: store i16 %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -164,12 +147,8 @@
 ; GCN-NOT: buffer_
 ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
-; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x i16> poison, i16 1, i32 %sel2
+; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -193,8 +172,7 @@
 ; GCN-NOT: buffer_
 ; GCN: v_mov_b32_e32 v1, 0
 
-; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
+; OPT: ret i64 undef
 
 define i64 @ptr_alloca_bitcast() {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -12,8 +12,6 @@
 ; FUNC-LABEL: @private_memory
 ; LOOP-NOT: alloca
-; LOOP: loop.header:
-; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
 
 ; FULL-UNROLL: alloca
 ; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, ptr addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
 
 target datalayout = "A5"
 
@@ -75,8 +76,7 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_float(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
+; OPT: %0 = insertelement <6 x float> poison, float %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -84,24 +84,13 @@
 ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
 ; GCN-ALLOCA: buffer_store_dword
 
-; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
-; GCN-PROMOTE-COUNT-6: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
 
 ; GCN: s_cbranch
 
 ; GCN-ALLOCA: buffer_load_dword
 
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-
 ; GCN-PROMOTE: ScratchSize: 0
 
 define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
@@ -147,8 +136,7 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_double(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
+; OPT: %0 = insertelement <6 x double> poison, double %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
@@ -208,8 +196,7 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_i64(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
+; OPT: %0 = insertelement <6 x i64> poison, i64 %tmp6, i32 %tmp9
 ; OPT: .preheader:
 ; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18
@@ -272,7 +259,7 @@
 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
 
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
-; GCN-COUNT-4: buffer_store_dword
+; GCN-COUNT: buffer_store_dword
 
 define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
 entry:
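---

For reference, the core pattern this patch adopts in the vector-promotion path, reduced to a minimal, self-contained sketch. This is not code from the patch: the helper name rewriteStores, the fixed lane index 0, and the surrounding setup are illustrative assumptions; only the SSAUpdater/IRBuilder calls are real LLVM APIs. Instead of round-tripping every access through a load/store of the whole alloca, the current vector value is threaded through the CFG with SSAUpdater, which materializes PHIs at control-flow merges:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

using namespace llvm;

// Rewrite scalar stores into a promoted alloca as insertelement updates of a
// virtual vector value. Stores must be visited in dominance order, mirroring
// the sort this patch performs on its worklist.
static void rewriteStores(AllocaInst &Alloca, FixedVectorType *VectorTy,
                          ArrayRef<StoreInst *> StoresInDomOrder) {
  SSAUpdater Updater;
  Updater.Initialize(VectorTy, "promotealloca");

  // The alloca starts out as uninitialized memory; model that as poison.
  Updater.AddAvailableValue(Alloca.getParent(), PoisonValue::get(VectorTy));

  for (StoreInst *SI : StoresInDomOrder) {
    IRBuilder<> B(SI);
    // Vector value reaching this block so far.
    Value *Vec = Updater.GetValueAtEndOfBlock(SI->getParent());
    // The real pass derives the lane from the GEP; assume lane 0 here.
    Value *NewVec =
        B.CreateInsertElement(Vec, SI->getValueOperand(), B.getInt32(0));
    // Make the updated vector the live-out value for this block; later
    // queries from other blocks get PHIs inserted as needed.
    Updater.AddAvailableValue(SI->getParent(), NewVec);
    SI->eraseFromParent();
  }
}

Loads are handled symmetrically: query GetValueAtEndOfBlock for the reaching vector, extract the lane, and replace all uses of the load with the result, after which the alloca itself is dead and can be erased.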