diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -28,7 +28,10 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/InstSimplifyFolder.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" @@ -38,6 +41,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -45,20 +49,20 @@ namespace { -static cl::opt<bool> DisablePromoteAllocaToVector( - "disable-promote-alloca-to-vector", - cl::desc("Disable promote alloca to vector"), - cl::init(false)); +static cl::opt<bool> + DisablePromoteAllocaToVector("disable-promote-alloca-to-vector", + cl::desc("Disable promote alloca to vector"), + cl::init(false)); -static cl::opt<bool> DisablePromoteAllocaToLDS( - "disable-promote-alloca-to-lds", - cl::desc("Disable promote alloca to LDS"), - cl::init(false)); +static cl::opt<bool> + DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds", + cl::desc("Disable promote alloca to LDS"), + cl::init(false)); static cl::opt<unsigned> PromoteAllocaToVectorLimit( - "amdgpu-promote-alloca-to-vector-limit", - cl::desc("Maximum byte size to consider promote alloca to vector"), - cl::init(0)); + "amdgpu-promote-alloca-to-vector-limit", + cl::desc("Maximum byte size to consider promote alloca to vector"), + cl::init(0)); // Shared implementation which can do both promotion to vector and to LDS. class AMDGPUPromoteAllocaImpl { @@ -80,17 +84,16 @@ /// BaseAlloca is the alloca root the search started from. /// Val may be that alloca or a recursive user of it. - bool collectUsesWithPtrTypes(Value *BaseAlloca, - Value *Val, - std::vector<Value *> &WorkList) const; + bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, + std::vector<Value *> &WorkList) const; /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). /// Returns true if both operands are derived from the same alloca. Val should /// be the same value as one of the input operands of UseInst. bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, - Instruction *UseInst, - int OpIdx0, int OpIdx1) const; + Instruction *UseInst, int OpIdx0, + int OpIdx1) const; /// Check whether we have enough local memory for promotion. bool hasSufficientLocalMem(const Function &F); @@ -253,6 +256,10 @@ Changed = true; } + // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains + // dangling pointers. If we want to reuse it past this point, the loop above + // would need to be updated to remove successfully promoted allocas. + return Changed; } @@ -269,6 +276,10 @@ using namespace PatternMatch; // For now we only care about non-volatile memsets that affect the whole type // (start at index 0 and fill the whole alloca). + // + // TODO: Now that we moved to PromoteAlloca we could handle any memsets + // (except maybe volatile ones?) - we just need to use shufflevector if it + // only affects a subset of the vector.
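+  // A possible shape for that, as an illustrative sketch only (NumSet is an
+  // assumed name for the number of elements the memset would cover, starting
+  // at DestBegin; Elt is the fill value already converted to the element type,
+  // as in the existing memset handling in promoteAllocaUserToVector):
+  //   Value *Splat = Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+  //   SmallVector<int> Mask;
+  //   for (unsigned I = 0; I < VectorTy->getNumElements(); ++I)
+  //     Mask.push_back(I >= DestBegin && I < DestBegin + NumSet
+  //                        ? I + VectorTy->getNumElements() // lane from Splat
+  //                        : I);                            // keep current lane
+  //   Value *NewVal = Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Splat, Mask);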
const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType()); return I->getOperand(0) == AI && match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); @@ -319,6 +330,200 @@ return ConstantInt::get(GEP->getContext(), Quot); } +/// Promotes a single user of the alloca to a vector form. +/// +/// \param Inst Instruction to be promoted. +/// \param DL Module Data Layout. +/// \param VectorTy Vectorized Type. +/// \param VecStoreSize Size of \p VectorTy in bytes. +/// \param ElementSize Size of \p VectorTy element type in bytes. +/// \param TransferInfo MemTransferInst info map. +/// \param GEPVectorIdx GEP -> VectorIdx cache. +/// \param CurVal Current value of the vector (e.g. last stored value) +/// \param[out] DeferredLoads \p Inst is added to this vector if it can't +/// be promoted now. This happens when promoting requires \p +/// CurVal, but \p CurVal is nullptr. +/// \return the stored value if \p Inst would have written to the alloca, or +/// nullptr otherwise. +static Value *promoteAllocaUserToVector( + Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, + unsigned VecStoreSize, unsigned ElementSize, + DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, + std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal, + SmallVectorImpl<LoadInst *> &DeferredLoads) { + // Note: we use InstSimplifyFolder because it can leverage the DataLayout + // to do more folding, especially in the case of vector splats. + IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), + InstSimplifyFolder(DL)); + Builder.SetInsertPoint(Inst); + + const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { + if (CurVal) + return CurVal; + + // If the current value is not known, insert a dummy load and lower it on + // the second pass. + LoadInst *Dummy = + Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), + "promotealloca.dummyload"); + DeferredLoads.push_back(Dummy); + return Dummy; + }; + + const auto CreateTempPtrIntCast = + [&Builder, VecStoreSize](Value *Val, Type *PtrTy) -> Value * { + const unsigned TempIntSize = (VecStoreSize * 8); + if (!PtrTy->isVectorTy()) + return Builder.CreateBitOrPointerCast(Val, + Builder.getIntNTy(TempIntSize)); + const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements(); + // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to + // first cast the ptr vector to <2 x i64>. + assert(alignTo(TempIntSize, NumPtrElts) == TempIntSize && + "Vector size not divisible"); + Type *EltTy = Builder.getIntNTy(TempIntSize / NumPtrElts); + return Builder.CreateBitOrPointerCast( + Val, FixedVectorType::get(EltTy, NumPtrElts)); + }; + + Type *VecEltTy = VectorTy->getElementType(); + switch (Inst->getOpcode()) { + case Instruction::Load: { + // Loads can only be lowered if the value is known. + if (!CurVal) { + DeferredLoads.push_back(cast<LoadInst>(Inst)); + return nullptr; + } + + Value *Index = calculateVectorIndex( + cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx); + + // We're loading the full vector. + if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) { + assert(cast<Constant>(Index)->isZeroValue()); + Type *InstTy = Inst->getType(); + if (InstTy->isPtrOrPtrVectorTy()) + CurVal = CreateTempPtrIntCast(CurVal, InstTy); + Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, InstTy); + Inst->replaceAllUsesWith(NewVal); + return nullptr; + } + + // We're loading one element.
+ Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index); + if (Inst->getType() != VecEltTy) + ExtractElement = + Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); + + Inst->replaceAllUsesWith(ExtractElement); + return nullptr; + } + case Instruction::Store: { + // For stores, it's a bit trickier and it depends on whether we're storing + // the full vector or not. If we're storing the full vector, we don't need + // to know the current value. If this is a store of a single element, we + // need to know the value. + StoreInst *SI = cast<StoreInst>(Inst); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Val = SI->getValueOperand(); + + // We're storing the full vector; we can handle this without knowing CurVal. + if (DL.getTypeStoreSize(Val->getType()) == VecStoreSize) { + assert(cast<Constant>(Index)->isZeroValue()); + Type *SrcTy = Val->getType(); + if (SrcTy->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, SrcTy); + return Builder.CreateBitOrPointerCast(Val, VectorTy); + } + + if (Val->getType() != VecEltTy) + Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); + return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, + Index); + } + case Instruction::Call: { + if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) { + // For memcpy, we need to know CurVal. + ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); + unsigned NumCopied = Length->getZExtValue() / ElementSize; + MemTransferInfo *TI = &TransferInfo[MTI]; + unsigned SrcBegin = TI->SrcIndex->getZExtValue(); + unsigned DestBegin = TI->DestIndex->getZExtValue(); + + SmallVector<int> Mask; + for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { + Mask.push_back(SrcBegin++); + } else { + Mask.push_back(Idx); + } + } + + return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + } + + if (auto *MSI = dyn_cast<MemSetInst>(Inst)) { + // For memset, we don't need to know the previous value because we + // currently only allow memsets that cover the whole alloca. + Value *Elt = MSI->getOperand(1); + if (DL.getTypeStoreSize(VecEltTy) > 1) { + Value *EltBytes = + Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt); + Elt = Builder.CreateBitCast(EltBytes, VecEltTy); + } + + return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + } + + llvm_unreachable("Unsupported call when promoting alloca to vector"); + } + + default: + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + + llvm_unreachable("Did not return after promoting instruction!"); +} + +/// Iterates over an instruction worklist that may contain multiple instructions +/// from the same basic block, but possibly not in program order. +template <typename InstContainer> +static void forEachWorkListItem(const InstContainer &WorkList, + std::function<void(Instruction *)> Fn) { + // Bucket up uses of the alloca by the block they occur in. + // This is important because we have to handle multiple defs/uses in a block + // ourselves: SSAUpdater is purely for cross-block references. + DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock; + for (Instruction *User : WorkList) + UsesByBlock[User->getParent()].insert(User); + + for (Instruction *User : WorkList) { + BasicBlock *BB = User->getParent(); + auto &BlockUses = UsesByBlock[BB]; + + // Already processed, skip. + if (BlockUses.empty()) + continue; + + // Only user in the block, directly process it. + if (BlockUses.size() == 1) { + Fn(User); + continue; + } + + // Multiple users in the block, do a linear scan to see users in order.
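+ // For example, the worklist may list a load ahead of a store that actually precedes it in the block; handling them in worklist order would feed the load a stale vector value, so we walk the block to respect program order.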
+ for (Instruction &Inst : *BB) { + if (!BlockUses.contains(&Inst)) + continue; + + Fn(&Inst); + } + + // Clear the block so we know it's been processed. + BlockUses.clear(); + } +} + // FIXME: Should try to pick the most likely to be profitable allocas first. bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); @@ -365,6 +570,7 @@ std::map<GetElementPtrInst *, Value *> GEPVectorIdx; SmallVector<Instruction *> WorkList; + SmallVector<Instruction *> UsersToRemove; SmallVector<Instruction *> DeferredInsts; SmallVector<Use *> Uses; DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; @@ -393,12 +599,18 @@ return RejectUser(Inst, "pointer is being stored"); Type *AccessTy = getLoadStoreType(Inst); + if (AccessTy->isAggregateType()) + return RejectUser(Inst, "unsupported load/store as aggregate"); + assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); + Ptr = Ptr->stripPointerCasts(); - // Alloca already accessed as vector, leave alone. + // Alloca already accessed as vector. if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) + DL->getTypeStoreSize(AccessTy)) { + WorkList.push_back(Inst); continue; + } // Check that this is a simple access of a vector element. bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple() @@ -416,6 +628,7 @@ // Look through bitcasts. for (Use &U : Inst->uses()) Uses.push_back(&U); + UsersToRemove.push_back(Inst); continue; } @@ -429,6 +642,7 @@ GEPVectorIdx[GEP] = Index; for (Use &U : Inst->uses()) Uses.push_back(&U); + UsersToRemove.push_back(Inst); continue; } @@ -481,13 +695,17 @@ } // Ignore assume-like intrinsics and comparisons used in assumes. - if (isAssumeLikeIntrinsic(Inst)) + if (isAssumeLikeIntrinsic(Inst)) { + UsersToRemove.push_back(Inst); continue; + } if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast<Instruction>(U)); - })) + })) { + UsersToRemove.push_back(Inst); continue; + } return RejectUser(Inst, "unhandled alloca user"); } @@ -505,75 +723,60 @@ LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); + + // Alloca is uninitialized memory. Imitate that by making the first value + // undef. + SSAUpdater Updater; + Updater.Initialize(VectorTy, "promotealloca"); + Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy)); + + // First handle the initial worklist. + SmallVector<LoadInst *> DeferredLoads; + forEachWorkListItem(WorkList, [&](Instruction *I) { + BasicBlock *BB = I->getParent(); + // On the first pass, we only take values that are trivially known, i.e. + // where AddAvailableValue was already called in this block. + Value *Result = promoteAllocaUserToVector( + I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, + Updater.FindValueForBlock(BB), DeferredLoads); + if (Result) + Updater.AddAvailableValue(BB, Result); + }); + + // Then handle deferred loads. + forEachWorkListItem(DeferredLoads, [&](Instruction *I) { + SmallVector<LoadInst *> NewDLs; + BasicBlock *BB = I->getParent(); + // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always + // get a value, inserting PHIs as needed. + Value *Result = promoteAllocaUserToVector( + I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, + Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); + if (Result) + Updater.AddAvailableValue(BB, Result); + assert(NewDLs.empty() && "No more deferred loads should be queued!"); + }); + + // Delete all instructions.
On the first pass, new dummy loads may have been + added so we need to collect them too. + DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); + InstsToDelete.insert(DeferredLoads.begin(), DeferredLoads.end()); + for (Instruction *I : InstsToDelete) { + assert(I->use_empty()); + I->eraseFromParent(); + } - for (Instruction *Inst : WorkList) { - IRBuilder<> Builder(Inst); - switch (Inst->getOpcode()) { - case Instruction::Load: { - Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign()); - Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); - if (Inst->getType() != VecEltTy) - ExtractElement = - Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); - Inst->replaceAllUsesWith(ExtractElement); - Inst->eraseFromParent(); - break; - } - case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(Inst); - Value *Ptr = SI->getPointerOperand(); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign()); - Value *Elt = SI->getValueOperand(); - if (Elt->getType() != VecEltTy) - Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index); - Builder.CreateAlignedStore(NewVecValue, &Alloca, Alloca.getAlign()); - Inst->eraseFromParent(); - break; - } - case Instruction::Call: { - if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) { - ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); - unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)]; - unsigned SrcBegin = TI->SrcIndex->getZExtValue(); - unsigned DestBegin = TI->DestIndex->getZExtValue(); - - SmallVector<int> Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { - if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin++); - } else { - Mask.push_back(Idx); - } - } - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign()); - Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask); - Builder.CreateAlignedStore(NewVecValue, &Alloca, Alloca.getAlign()); - - Inst->eraseFromParent(); - } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { - // Ensure the length parameter of the memsets matches the new vector - // type's. In general, the type size shouldn't change so this is a - // no-op, but it's better to be safe. - MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy))); - } else { - llvm_unreachable("Unsupported call when promoting alloca to vector"); - } - break; - } - - default: - llvm_unreachable("Inconsistency in instructions promotable to vector"); - } + // Delete all the users that are known to be removable. + for (Instruction *I : reverse(UsersToRemove)) { + I->dropDroppableUses(); + assert(I->use_empty()); + I->eraseFromParent(); } + // Alloca should now be dead too.
+ assert(Alloca.use_empty()); + Alloca.eraseFromParent(); return true; } @@ -1061,7 +1264,7 @@ CurrentLocalMemUsage = NewSize; - std::vector WorkList; + std::vector WorkList; if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); @@ -1204,10 +1407,9 @@ assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); MemTransferInst *MI = cast(Intr); - auto *B = - Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(), - MI->getRawSource(), MI->getSourceAlign(), - MI->getLength(), MI->isVolatile()); + auto *B = Builder.CreateMemTransferInst( + ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(), + MI->getSourceAlign(), MI->getLength(), MI->isVolatile()); for (unsigned I = 0; I != 2; ++I) { if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) { diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -13,37 +13,20 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s16, s33 -; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: s_addk_i32 s32, 0x3000 +; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v42, s16, 2 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v42, s30, 0 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v42, s31, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -57,9 +40,9 @@ ; GCN-NEXT: v_readlane_b32 s30, v42, 0 ; GCN-NEXT: v_readlane_b32 s4, v42, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xd000 +; GCN-NEXT: 
s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s ; Make sure that array alloca loaded and stored as multi-element aggregates are handled correctly ; Strictly the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but @@ -16,19 +16,16 @@ define amdgpu_vs void @promote_1d_aggr() #0 { ; CHECK-LABEL: @promote_1d_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1 ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4 -; CHECK-NEXT: store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] +; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0 +; CHECK-NEXT: [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0 +; CHECK-NEXT: store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]] ; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4 -; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 -; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0 +; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0 ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1 ; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2 ; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3 @@ -60,22 +57,12 @@ define amdgpu_vs void @promote_store_aggr() #0 { ; CHECK-LABEL: @promote_store_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) -; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4 -; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0 -; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 
0, i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1 -; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4 +; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float +; CHECK-NEXT: [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0 +; CHECK-NEXT: [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1 ; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1 -; CHECK-NEXT: store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4 +; CHECK-NEXT: store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4 ; CHECK-NEXT: store <4 x float> , ptr addrspace(1) @pv, align 16 ; CHECK-NEXT: ret void ; @@ -100,23 +87,18 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 { ; CHECK-LABEL: @promote_load_from_store_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) -; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1 ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4 -; CHECK-NEXT: store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]] -; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 -; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0 -; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2 -; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3 +; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0 +; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]] +; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 +; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3 ; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16 ; CHECK-NEXT: ret void ; @@ -142,22 +124,7 @@ define amdgpu_vs void @promote_memmove_aggr() #0 { ; CHECK-LABEL: @promote_memmove_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: 
store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1 -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3 -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 -; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -174,24 +141,12 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 { ; CHECK-LABEL: @promote_memcpy_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3 -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]] -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 -; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> , float 3.000000e+00, i32 [[FOO4]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0 +; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -213,22 +168,7 @@ define amdgpu_vs void @promote_memcpy_identity_aggr() #0 { ; CHECK-LABEL: @promote_memcpy_identity_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, 
addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1 -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3 -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 -; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -248,8 +188,26 @@ ; CHECK-LABEL: @promote_memcpy_two_aggrs( ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) ; CHECK-NEXT: [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4 +; CHECK-NEXT: [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4 +; CHECK-NEXT: [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4 +; CHECK-NEXT: [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4 +; CHECK-NEXT: [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4 +; CHECK-NEXT: [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4 +; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4 +; CHECK-NEXT: 
[[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] @@ -283,7 +241,16 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 { ; CHECK-LABEL: @promote_memcpy_p1p5_aggr( ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 +; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] @@ -305,21 +272,12 @@ define amdgpu_vs void @promote_memcpy_inline_aggr() #0 { ; CHECK-LABEL: @promote_memcpy_inline_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]] -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0 -; CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 
x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0 +; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -347,30 +305,16 @@ define amdgpu_ps void @promote_double_aggr() #0 { ; CHECK-LABEL: @promote_double_aggr( -; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0 ; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8 ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1 ; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8 ; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0 ; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1 -; CHECK-NEXT: store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 -; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 -; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0 -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1 -; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0 +; CHECK-NEXT: [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1 +; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]] +; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]] ; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float ; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0 ; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1 @@ -410,21 +354,6 @@ define amdgpu_kernel void @alloca_struct() #0 { ; CHECK-LABEL: @alloca_struct( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range 
[[RNG1:![0-9]+]], !invariant.load !0 -; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -6,7 +6,7 @@ @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) { -; IR: alloca [10 x i32] +; IR-NOT: alloca [10 x i32] ; ASM-LABEL: {{^}}promote_alloca_size_256: ; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16 ; ASM-NOT: .amdgpu_lds diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) { +; CHECK-LABEL: define amdgpu_kernel void @test_overwrite +; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 68, i32 0 +; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0 +; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68 +; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ , [[ENTRY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [3 x i64], align 4, addrspace(5) + store i64 43, ptr addrspace(5) %stack + br i1 %cond, label %loop, label %end + +loop: + %load.0 = load i64, ptr addrspace(5) %stack + store i64 68, ptr addrspace(5) %stack + %load.1 = load i64, ptr addrspace(5) %stack + store i64 32, ptr addrspace(5) %stack + %loop.cc = icmp ne i64 %load.0, %load.1 + br i1 %loop.cc, label %loop, label %end + +end: + %reload = load i64, ptr addrspace(5) %stack + ret void 
+} + +define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) { +; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite +; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0 +; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 32, i32 1 +; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32 +; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ , [[ENTRY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 1 +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [3 x i64], align 4, addrspace(5) + %stack.1 = getelementptr inbounds i64, ptr addrspace(5) %stack, i32 1 + store i64 43, ptr addrspace(5) %stack + br i1 %cond, label %loop, label %end + +loop: + %load = load i64, ptr addrspace(5) %stack + store i64 32, ptr addrspace(5) %stack.1 + %loop.cc = icmp ne i64 %load, 32 + br i1 %loop.cc, label %loop, label %end + +end: + %reload = load i64, ptr addrspace(5) %stack + %reload.1 = load i64, ptr addrspace(5) %stack.1 + ret void +} + +define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) { +; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec +; CHECK-SAME: (ptr [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: ret ptr [[TMP3]] +; +entry: + %alloca = alloca [8 x i8], align 8, addrspace(5) + store ptr %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load ptr, ptr addrspace(5) %alloca, align 8 + ret ptr %tmp +} + +define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) { +; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec +; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[ARG]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: ret ptr addrspace(3) [[TMP3]] +; +entry: + %alloca = alloca [4 x i8], align 8, addrspace(5) + store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8 + ret ptr addrspace(3) %tmp +} + +define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) { +; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec +; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)> +; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[TMP2]] +; +entry: + %alloca = alloca [4 x i32], align 8, addrspace(5) + store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load <4 x ptr 
addrspace(3)>, ptr addrspace(5) %alloca, align 8 + ret <4 x ptr addrspace(3)> %tmp +} + +; Currently rejected due to the store not being cast-able. +; TODO: We should probably be able to vectorize this +define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) { +; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec +; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i32], align 8, addrspace(5) +; CHECK-NEXT: store <2 x ptr addrspace(3)> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: [[TMP:%.*]] = load <2 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: [[TMP_FULL:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [8 x i32], align 8, addrspace(5) + store <2 x ptr addrspace(3)> %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load <2 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 + %tmp.full = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 + ret void +} + +; Will not vectorize because we're accessing a 64 bit vector with a 32 bits pointer. +define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec(ptr addrspace(3) %arg) { +; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec +; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5) +; CHECK-NEXT: store ptr addrspace(3) [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: [[TMP:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: ret ptr addrspace(3) [[TMP]] +; +entry: + %alloca = alloca [8 x i8], align 8, addrspace(5) + store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8 + ret ptr addrspace(3) %tmp +} diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -1,19 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s ; Checks that memsets don't block PromoteAlloca. -; Note: memsets are just updated with the new type size. They are not eliminated which means -; the original alloca also stay. This puts a bit more load on SROA. -; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with -; e.g. ConstantAggregate. 
-
 define amdgpu_kernel void @memset_all_zero(i64 %val) {
 ; CHECK-LABEL: @memset_all_zero(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -30,8 +24,7 @@
 ; CHECK-LABEL: @memset_all_5(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> , i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -47,11 +40,9 @@
 define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_volatile_nopromote(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -64,11 +55,9 @@
 define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_badsize_nopromote(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -81,8 +70,10 @@
 define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_offset_ptr_nopromote(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@@ -4,15 +4,10 @@
 define i64 @test_pointer_array(i64 %v) {
 ; OPT-LABEL: @test_pointer_array(
 ; OPT-NEXT: entry:
-; OPT-NEXT: [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
-; OPT-NEXT: [[TMP0:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT: [[TMP1:%.*]] = inttoptr i64 [[V:%.*]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = insertelement <3 x ptr> [[TMP0]], ptr [[TMP1]], i32 0
-; OPT-NEXT: store <3 x ptr> [[TMP2]], ptr addrspace(5) [[A]], align 16
-; OPT-NEXT: [[TMP3:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT: [[TMP4:%.*]] = extractelement <3 x ptr> [[TMP3]], i32 0
-; OPT-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
-; OPT-NEXT: ret i64 [[TMP5]]
+; OPT-NEXT: [[TMP0:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT: [[TMP1:%.*]] = insertelement <3 x ptr> undef, ptr [[TMP0]], i32 0
+; OPT-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; OPT-NEXT: ret i64 [[TMP2]]
 ;
 entry:
 %a = alloca [3 x ptr], align 16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
@@ -11,11 +11,8 @@
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
 ; GCN: store_dword v{{.+}}, [[RES]]
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x float> , ptr addrspace(5) %alloca, align 4
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x float> %0, i32 %sel2
-; OPT: store float %1, ptr addrspace(1) %out, align 4
+; OPT: %0 = extractelement <4 x float> , i32 %sel2
+; OPT: store float %0, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -46,12 +43,8 @@
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN: store_dwordx4 v{{.+}},
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
-; OPT: store <4 x float> %load, ptr addrspace(1) %out, align 4
+; OPT: %0 = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -77,11 +70,8 @@
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x half> , ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
-; OPT: store half %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x half> , i32 %sel2
+; OPT: store half %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -105,12 +95,8 @@
 ; GCN-NOT: buffer_
 ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x half> undef, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -136,11 +122,8 @@
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x i16> , ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
-; OPT: store i16 %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x i16> , i32 %sel2
+; OPT: store i16 %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -164,12 +147,8 @@
 ; GCN-NOT: buffer_
 ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
-; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x i16> undef, i16 1, i32 %sel2
+; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -193,8 +172,7 @@
 ; GCN-NOT: buffer_
 ; GCN: v_mov_b32_e32 v1, 0
-; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
+; OPT: ret i64 undef
 define i64 @ptr_alloca_bitcast() {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -11,7 +11,7 @@
 ; so that we do not need to fully unroll it.
 ; FUNC-LABEL: @private_memory
-; LOOP-NOT: alloca
+; LOOP-NOT: = alloca
 ; LOOP: loop.header:
 ; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
 target datalayout = "A5"
@@ -75,8 +76,8 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_float(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
+; OPT: %promotealloca = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -84,24 +85,13 @@
 ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
 ; GCN-ALLOCA: buffer_store_dword
-; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
-; GCN-PROMOTE-COUNT-6: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
 ; GCN: s_cbranch
 ; GCN-ALLOCA: buffer_load_dword
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-
 ; GCN-PROMOTE: ScratchSize: 0
 define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
@@ -147,8 +137,8 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_double(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
+; OPT: %promotealloca = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
@@ -208,8 +198,8 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_i64(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
+; OPT: %promotealloca = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x i64> %promotealloca, i64 %tmp6, i32 %tmp9
 ; OPT: .preheader:
 ; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18
@@ -272,7 +262,7 @@
 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
-; GCN-COUNT-4: buffer_store_dword
+; GCN-COUNT: buffer_store_dword
 define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
 entry: