diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -379,6 +379,11 @@
   return ConstantInt::get(GEP->getContext(), Quot);
 }
 
+struct MemTransferInfo {
+  ConstantInt *SrcIndex = nullptr;
+  ConstantInt *DestIndex = nullptr;
+};
+
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                      unsigned MaxVGPRs) {
 
@@ -419,11 +424,15 @@
   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
+  SmallVector<Instruction *> DeferredInsts;
   SmallVector<Use *, 8> Uses;
+  DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
+
   for (Use &U : Alloca->uses())
     Uses.push_back(&U);
 
   Type *VecEltTy = VectorTy->getElementType();
+  unsigned ElementSize = DL.getTypeSizeInBits(VecEltTy) / 8;
   while (!Uses.empty()) {
     Use *U = Uses.pop_back_val();
     Instruction *Inst = cast<Instruction>(U->getUser());
@@ -476,6 +485,47 @@
       continue;
     }
 
+    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
+      if (TransferInst->isVolatile())
+        return false;
+
+      ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
+      if (!Len || !!(Len->getZExtValue() % ElementSize))
+        return false;
+
+      if (!TransferInfo.count(TransferInst)) {
+        DeferredInsts.push_back(Inst);
+        WorkList.push_back(Inst);
+        TransferInfo[TransferInst] = MemTransferInfo();
+      }
+
+      auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
+        GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+        if (Ptr != Alloca && !GEPVectorIdx.count(GEP))
+          return nullptr;
+
+        return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
+      };
+
+      unsigned OpNum = U->getOperandNo();
+      MemTransferInfo *TI = &TransferInfo[TransferInst];
+      if (OpNum == 0) {
+        Value *Dest = TransferInst->getDest();
+        ConstantInt *Index = getPointerIndexOfAlloca(Dest);
+        if (!Index)
+          return false;
+        TI->DestIndex = Index;
+      } else {
+        assert(OpNum == 1);
+        Value *Src = TransferInst->getSource();
+        ConstantInt *Index = getPointerIndexOfAlloca(Src);
+        if (!Index)
+          return false;
+        TI->SrcIndex = Index;
+      }
+      continue;
+    }
+
     // Ignore assume-like intrinsics and comparisons used in assumes.
     if (isAssumeLikeIntrinsic(Inst))
       continue;
@@ -489,6 +539,16 @@
     return false;
   }
 
+  while (!DeferredInsts.empty()) {
+    Instruction *Inst = DeferredInsts.pop_back_val();
+    MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
+    // TODO: Support the case where the pointers are from different allocas
+    // or from different address spaces.
+    MemTransferInfo &Info = TransferInfo[TransferInst];
+    if (!Info.SrcIndex || !Info.DestIndex)
+      return false;
+  }
+
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
@@ -525,6 +585,35 @@
       Inst->eraseFromParent();
       break;
     }
+    case Instruction::Call: {
+      if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
+        ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
+        unsigned NumCopied = Length->getZExtValue() / ElementSize;
+        MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
+        unsigned SrcBegin = TI->SrcIndex->getZExtValue();
+        unsigned DestBegin = TI->DestIndex->getZExtValue();
+
+        SmallVector<int> Mask;
+        for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+          if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
+            Mask.push_back(SrcBegin++);
+          } else {
+            Mask.push_back(Idx);
+          }
+        }
+        Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
+        Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+        Value *VecValue =
+            Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign());
+        Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
+        Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
+
+        Inst->eraseFromParent();
+      } else {
+        llvm_unreachable("Unsupported call when promoting alloca to vector");
+      }
+      break;
+    }
 
     default:
       llvm_unreachable("Inconsistency in instructions promotable to vector");
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -140,6 +140,208 @@
   ret void
 }
 
+define amdgpu_vs void @promote_memmove_aggr() #0 {
+; CHECK-LABEL: @promote_memmove_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    ret void
+;
+  %f1 = alloca [5 x float], addrspace(5)
+  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
+  %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
+  store float 1.0, ptr addrspace(5) %foo1
+  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
+  store float 2.0, ptr addrspace(5) %foo2
+  call void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
+  %foo3 = load float, ptr addrspace(5) %f1
+  store float %foo3, ptr addrspace(1) @pv
+  ret void
+}
+
+define amdgpu_vs void @promote_memcpy_aggr() #0 {
+; CHECK-LABEL: @promote_memcpy_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    ret void
+;
+  %f1 = alloca [5 x float], addrspace(5)
+  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
+
+  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
+  store float 2.0, ptr addrspace(5) %foo2
+
+  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
+  %foo4 = load i32, ptr addrspace(1) %foo3
+  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+  store float 3.0, ptr addrspace(5) %foo5
+
+  call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
+  %foo6 = load float, ptr addrspace(5) %f1
+  store float %foo6, ptr addrspace(1) @pv
+  ret void
+}
+
+define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
+; CHECK-LABEL: @promote_memcpy_identity_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    ret void
+;
+  %f1 = alloca [5 x float], addrspace(5)
+  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
+  %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
+  store float 1.0, ptr addrspace(5) %foo1
+  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
+  store float 2.0, ptr addrspace(5) %foo2
+  call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
+  %foo3 = load float, ptr addrspace(5) %f1
+  store float %foo3, ptr addrspace(1) @pv
+  ret void
+}
+
+; TODO: promote alloca even when there is a memcpy between different allocas
+define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
+; CHECK-LABEL: @promote_memcpy_two_aggrs(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
+; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 [[F2]], ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
+; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[FOO7:%.*]] = load float, ptr addrspace(5) [[FOO6]], align 4
+; CHECK-NEXT:    store float [[FOO7]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    ret void
+;
+  %f1 = alloca [5 x float], addrspace(5)
+  %f2 = alloca [5 x float], addrspace(5)
+
+  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
+  store [5 x float] zeroinitializer, ptr addrspace(5) %f2
+
+  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
+  %foo4 = load i32, ptr addrspace(1) %foo3
+  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+  store float 3.0, ptr addrspace(5) %foo5
+
+  call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
+
+  %foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4
+  %foo7 = load float, ptr addrspace(5) %foo6
+  store float %foo7, ptr addrspace(1) @pv
+  ret void
+}
+
+; TODO: promote alloca even when there is a memcpy between the alloca and another address space.
+define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
+; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %f1 = alloca [5 x float], addrspace(5)
+  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
+
+  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
+  %foo4 = load i32, ptr addrspace(1) %foo3
+  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+  store float 3.0, ptr addrspace(5) %foo5
+
+  call void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
+  ret void
+}
+
+define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
+; CHECK-LABEL: @promote_memcpy_inline_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0
+; CHECK-NEXT:    store float [[TMP6]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    ret void
+;
+  %f1 = alloca [5 x float], addrspace(5)
+  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
+
+  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
+  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
+  %foo4 = load i32, ptr addrspace(1) %foo3
+  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+  store float 3.0, ptr addrspace(5) %foo5
+
+  call void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
+  %foo6 = load float, ptr addrspace(5) %f1
+  store float %foo6, ptr addrspace(1) @pv
+  ret void
+}
+
+declare void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+declare void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+declare void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+
 @tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
 @frag_color = external addrspace(1) global <4 x float>
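
For reference, a minimal standalone sketch (not part of the patch) of the shuffle-mask construction performed by the new Instruction::Call case: buildTransferMask is a hypothetical helper name used only for illustration, mirroring the loop over VectorTy->getNumElements() above. Lanes in [DestBegin, DestBegin + NumCopied) are remapped to consecutive source lanes; all other lanes keep their identity index.

// Standalone sketch; buildTransferMask is a hypothetical helper, not an LLVM API.
#include <cstdio>
#include <vector>

// Lanes in [DestBegin, DestBegin + NumCopied) read from consecutive source
// lanes starting at SrcBegin; every other lane keeps its identity index.
static std::vector<int> buildTransferMask(unsigned NumElements,
                                          unsigned DestBegin, unsigned SrcBegin,
                                          unsigned NumCopied) {
  std::vector<int> Mask;
  for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
    if (Idx >= DestBegin && Idx < DestBegin + NumCopied)
      Mask.push_back(SrcBegin++); // lane copied from the source range
    else
      Mask.push_back(Idx); // lane keeps its previous value
  }
  return Mask;
}

int main() {
  // @promote_memcpy_aggr: dest %f1 (lane 0), src %foo2 (lane 3),
  // 8 bytes / 4-byte elements = 2 lanes copied.
  for (int M : buildTransferMask(5, 0, 3, 2))
    std::printf("%d ", M); // prints: 3 4 2 3 4
  std::printf("\n");
  // @promote_memmove_aggr: dest %f1 (lane 0), src %foo1 (lane 1),
  // 16 bytes / 4 = 4 lanes copied.
  for (int M : buildTransferMask(5, 0, 1, 4))
    std::printf("%d ", M); // prints: 1 2 3 4 4
  std::printf("\n");
  return 0;
}

Because the shufflevector reads the whole input vector before any lane is written, the same single load/shuffle/store sequence is correct for overlapping memmove ranges, which is why memmove needs no extra handling in the pass.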