diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
@@ -384,6 +385,28 @@
   ConstantInt *DestIndex = nullptr;
 };
 
+// Checks if the instruction I is a memset user of the alloca AI that we can
+// deal with. Currently, only non-volatile memsets that affect the whole alloca
+// are handled.
+static bool isSupportedMemset(Instruction *I, AllocaInst *AI,
+                              unsigned AllocaSize) {
+  using namespace PatternMatch;
+  Value *Ptr = nullptr;
+  uint64_t Len;
+  uint64_t ByteVal;
+  if (!match(I, m_Intrinsic<Intrinsic::memset>(m_Value(Ptr),
+                                               m_ConstantInt(ByteVal),
+                                               m_ConstantInt(Len), m_Zero())) &&
+      !match(I, m_Intrinsic<Intrinsic::memset_inline>(
+                    m_Value(Ptr), m_ConstantInt(ByteVal), m_ConstantInt(Len),
+                    m_Zero())))
+    return false;
+
+  // For now we only care about memsets that affect the whole type (start at
+  // index 0 and fill the whole alloca).
+  return Ptr == AI && Len == AllocaSize;
+}
+
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                      unsigned MaxVGPRs) {
@@ -535,6 +558,11 @@
         }))
       continue;
 
+    if (isSupportedMemset(Inst, Alloca, DL.getTypeSizeInBits(AllocaTy) / 8)) {
+      WorkList.push_back(Inst);
+      continue;
+    }
+
     // Unknown user.
     return false;
   }
@@ -586,6 +614,7 @@
       break;
     }
     case Instruction::Call: {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
      if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
        ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
        unsigned NumCopied = Length->getZExtValue() / ElementSize;
@@ -609,6 +638,12 @@
        Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
 
        Inst->eraseFromParent();
+      } else if (II->getIntrinsicID() == Intrinsic::memset ||
+                 II->getIntrinsicID() == Intrinsic::memset_inline) {
+        // Ensure the length parameter of the memsets matches the new vector
+        // type's. In general, the type size shouldn't change so this is a
+        // no-op, but it's better to be safe.
+        II->setOperand(2, Builder.getInt64(DL.getTypeSizeInBits(VectorTy) / 8));
      } else {
        llvm_unreachable("Unsupported call when promoting alloca to vector");
      }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Checks that memsets don't block PromoteAlloca.
+
+; Note: memsets are just updated with the new type size. They are not eliminated, which means
+; the original alloca also stays. This puts a bit more load on SROA.
+; If PromoteAlloca is moved to SSAUpdater, we could replace the memsets entirely with,
+; e.g., a ConstantAggregate.
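+;
+; (Illustrative note, not from the original patch:) in @memset_all_zero below, the
+; [6 x double] alloca occupies 6 * 8 = 48 bytes, so the 48-byte memset covers the
+; whole alloca and isSupportedMemset accepts it. The pass then keeps the memset and
+; rewrites its length operand to the size of the promoted <6 x double> type, which
+; is still 48 bytes.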
+ +; CHECK-LABEL: @memset_all_zero( +; CHECK: <6 x double> +define amdgpu_kernel void @memset_all_zero(double %val) { +entry: + %stack = alloca [6 x double], align 4, addrspace(5) + call void @llvm.memset.p5.double(ptr addrspace(5) %stack, i8 0, i64 48, i1 false) + store double %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_all_5( +; CHECK: <4 x i64> +define amdgpu_kernel void @memset_all_5(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 5, i64 32, i1 false) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_volatile_nopromote( +; CHECK-NOT: <4 x i64> +define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 32, i1 true) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_badsize_nopromote( +; CHECK-NOT: <4 x i64> +define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 31, i1 true) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_offset_ptr_nopromote( +; CHECK-NOT: <4 x i64> +define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + %gep = getelementptr [4 x i64], ptr addrspace(5) %stack, i64 0, i64 1 + call void @llvm.memset.p5.i64(ptr addrspace(5) %gep, i8 0, i64 24, i1 true) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +declare void @llvm.memset.p5.double(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
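
; (Illustrative sketch, not FileCheck-verified output of this patch:) after promotion,
; @memset_all_zero is expected to keep its memset (length still 48) while the scalar
; store of %val is rewritten through the promoted vector type, roughly:
;
;   %stack = alloca [6 x double], align 4, addrspace(5)
;   call void @llvm.memset.p5.double(ptr addrspace(5) %stack, i8 0, i64 48, i1 false)
;   %cur = load <6 x double>, ptr addrspace(5) %stack
;   %ins = insertelement <6 x double> %cur, double %val, i32 0
;   store <6 x double> %ins, ptr addrspace(5) %stack
;
; Exact value names, casts, and alignments depend on the pass implementation.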