Diff Detail

Repository: rG LLVM Github Monorepo
Event Timeline
Shouldn't these have been expanded into loads and stores already?
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
  508: Extra ;
I think InstCombine and/or SROA do some too. Also, that threshold is arbitrary, and we could start expanding the small ones in IR too.
If I understand correctly, InstCombine focuses on simplifying mem transfer intrinsics rather than expanding them, and it only works in very limited cases.
In fact, the change here shares a similar idea with SROA: it analyzes whether the alloca involved in the memmove/memcpy can be promoted to a vector/scalar operation. If it is promotable, we expand the mem transfer intrinsic as part of the promotion process. I would say the important thing here is to get optimal code generation for small allocas; expanding small mem transfers is less important. Does this sound reasonable to you? Or could you share how you would like us to expand small mem transfer intrinsics?
In the long term, I hope we can enhance SROA to cover the optimization we have done in tryPromoteAllocaToVector(), but that needs much more time/effort.
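(For reference: a minimal, hypothetical sketch of what expanding a small, constant-length mem transfer intrinsic directly in IR could look like. The helper name and the 16-byte threshold are illustrative only and are not part of this patch.)

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Illustrative only: turn a non-volatile, constant-length memcpy/memmove of at
// most 16 bytes into a single wide load/store pair. Loading fully before
// storing also keeps the overlapping (memmove) case correct.
static bool expandSmallMemTransfer(MemTransferInst *MTI) {
  auto *Len = dyn_cast<ConstantInt>(MTI->getLength());
  if (!Len || Len->isZero() || MTI->isVolatile() || Len->getZExtValue() > 16)
    return false; // the 16-byte threshold is arbitrary, as noted above

  IRBuilder<> B(MTI);
  Type *IntTy = B.getIntNTy(8 * Len->getZExtValue());
  LoadInst *L = B.CreateAlignedLoad(IntTy, MTI->getRawSource(),
                                    MTI->getSourceAlign(), "mt.val");
  B.CreateAlignedStore(L, MTI->getRawDest(), MTI->getDestAlign());
  MTI->eraseFromParent();
  return true;
}
```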
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
  508: I guess you might have misread here? I think we always put a semicolon at the end of a lambda expression.
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
  502: [=]?
  594–596: Probably should dyn_cast to MemIntrinsicInst here to defend against these becoming out of sync.
  612: No reason to care about typed pointers anymore.
llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
  219: Needs tests for the different address space cases you mentioned in the todo.
  219: Also memcpy_inline.
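(For reference on the 594–596 comment: a hedged sketch of what matching via a cast, rather than a switch over specific intrinsic IDs, could look like. visitAllocaUser and handleMemTransfer are hypothetical names, not code from the patch.)

```cpp
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool handleMemTransfer(MemTransferInst *MTI); // hypothetical helper

// Recognize the intrinsic through its instruction class instead of switching
// over specific intrinsic IDs, so the places that identify these intrinsics
// cannot drift out of sync.
static bool visitAllocaUser(Instruction *Inst) {
  // MemTransferInst covers memcpy, memmove, and memcpy.inline uniformly;
  // dyn_cast to MemIntrinsicInst would additionally match memset.
  if (auto *MTI = dyn_cast<MemTransferInst>(Inst))
    return handleMemTransfer(MTI);
  return false;
}
```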
Thanks for the careful review @arsenm, I have fixed most of them. Please take a second look. Thanks!
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
  502: We are capturing GEPVectorIdx and Alloca, so I think capture by reference is preferred over capture by copy here. Please correct me if I am wrong.
  594–596: Agreed, I have changed it to a cast to catch the possible broken case.
llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
  219: Good idea, I have added all of them.
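(To illustrate the line 502 exchange: a hypothetical snippet, assuming GEPVectorIdx is a map from GEPs to vector indices and Alloca is the alloca being promoted; the exact types and lambda in the patch may differ. With [&], the lambda reads the existing map and pointer in place, whereas [=] would copy the whole map into the closure.)

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include <map>
using namespace llvm;

// Illustrative only; names mirror the review discussion, not the patch itself.
static Value *lookupVectorIndex(AllocaInst *Alloca,
                                std::map<GetElementPtrInst *, Value *> &GEPVectorIdx,
                                Value *Ptr) {
  // Capture by reference: no copy of the GEPVectorIdx map or of Alloca.
  auto GetVectorIndex = [&](Value *P) -> Value * {
    if (P == Alloca) // direct access to the alloca means element 0
      return ConstantInt::get(Type::getInt32Ty(Alloca->getContext()), 0);
    auto It = GEPVectorIdx.find(dyn_cast<GetElementPtrInst>(P));
    return It == GEPVectorIdx.end() ? nullptr : It->second;
  };
  return GetVectorIndex(Ptr);
}
```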
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
  594–596: This still has the switch over specific IDs instead of the matching cast.
  612: Well, that needs to be fixed soon, since that code is going to start getting ripped out very shortly.
llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
  307: Another edge case to maybe test is the identity copy of the full alloca to itself.
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
  612: We have top engineers working on it! https://github.com/GPUOpen-Drivers/llpc/pull/2210
llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
  307: I have added a case for it, but I would not bother optimizing for it, as such code patterns are trivially optimized away early.