This is an archive of the discontinued LLVM Phabricator instance.

Differential D79641

[AMDGPU] Vectorize alloca thru bitcast
ClosedPublic

Authored by rampitec on May 8 2020, 11:27 AM.

Download Raw Diff

Details

Reviewers

arsenm

Commits

rGdb7dea2b6f7f: [AMDGPU] Vectorize alloca thru bitcast

Summary

This is mostly useful if the alloca element type is not an integer
and is then cast to an integer for a load or store. Currently we can
vectorize an [i32] alloca but cannot do so for [float].

A separate patch is also needed to properly lower 64-bit
types after they are vectorized. At the moment these are lowered
via scratch anyway.

Diff Detail

Event Timeline

rampitec created this revision.May 8 2020, 11:27 AM

Herald added a project: Restricted Project. · View Herald TranscriptMay 8 2020, 11:27 AM

Herald added subscribers: kerbowa, hiraditya, t-tye and 7 others. · View Herald Transcript

PSDB passed.

arsenm added inline comments.May 8 2020, 11:53 AM

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
310	I think this needs to be careful around multiple uses
435	Needs test with assume intrinsic
481	IRBuilder does this check for you so you can omit it
llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
28	Needs tests with multiple uses

Added more tests.

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
310	We just need to get to the actual pointer here, it does not matter if there are multiple uses, bitcasts are not removed anyway. Also note that it is not a problem if not all uses are converted; the alloca itself stays. The pass does partial vectorization even now.
435	It's the same as lifetime intrinsics in the test, but I will add one. Just note that a pointer cannot be passed into assume, it shall be a compare, so it will prevent alloca removal.
481	Actually it does not: Assertion `New->getType() == getType() && "replaceAllUses of value with new value of different type!"' failed.

arsenm accepted this revision.May 8 2020, 2:47 PM

This revision is now accepted and ready to land.May 8 2020, 2:47 PM

Closed by commit rGdb7dea2b6f7f: [AMDGPU] Vectorize alloca thru bitcast (authored by rampitec). · Explain WhyMay 8 2020, 3:36 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUPromoteAlloca.cpp

90 lines

test/

CodeGen/

AMDGPU/

vector-alloca-bitcast.ll

275 lines

Diff 262918

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Show First 20 Lines • Show All 296 Lines • ▼ Show 20 Lines	Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
return CI;		return CI;
}		}

static VectorType arrayTypeToVecType(ArrayType ArrayTy) {		static VectorType arrayTypeToVecType(ArrayType ArrayTy) {
return VectorType::get(ArrayTy->getElementType(),		return VectorType::get(ArrayTy->getElementType(),
ArrayTy->getNumElements());		ArrayTy->getNumElements());
}		}

		static Value stripBitcasts(Value V) {
		while (Instruction *I = dyn_cast<Instruction>(V)) {
		if (I->getOpcode() != Instruction::BitCast)
		break;
		V = I->getOperand(0);
		}
		arsenmUnsubmitted Done Reply Inline Actions I think this needs to be careful around multiple uses arsenm: I think this needs to be careful around multiple uses
		rampitecAuthorUnsubmitted Done Reply Inline Actions We just need to get to the actual pointer here, it does not matter if there are multiple uses, bitcasts are not removed anyway. Also note that it is not a problem if not all uses are converted; the alloca itself stays. The pass does partial vectorization even now. rampitec: We just need to get to the actual pointer here, it does not matter if there are multiple uses…
		return V;
		}

static Value *		static Value *
calculateVectorIndex(Value *Ptr,		calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst , Value > &GEPIdx) {		const std::map<GetElementPtrInst , Value > &GEPIdx) {
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);		GetElementPtrInst *GEP = cast<GetElementPtrInst>(stripBitcasts(Ptr));

auto I = GEPIdx.find(GEP);		auto I = GEPIdx.find(GEP);
return I == GEPIdx.end() ? nullptr : I->second;		return I == GEPIdx.end() ? nullptr : I->second;
}		}

static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {		static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
// FIXME we only support simple cases		// FIXME we only support simple cases
if (GEP->getNumOperands() != 3)		if (GEP->getNumOperands() != 3)
return nullptr;		return nullptr;

ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));		ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
if (!I0 \|\| !I0->isZero())		if (!I0 \|\| !I0->isZero())
return nullptr;		return nullptr;

return GEP->getOperand(2);		return GEP->getOperand(2);
}		}

// Not an instruction handled below to turn into a vector.		// Not an instruction handled below to turn into a vector.
//		//
// TODO: Check isTriviallyVectorizable for calls and handle other		// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.		// instructions.
static bool canVectorizeInst(Instruction Inst, User User) {		static bool canVectorizeInst(Instruction Inst, User User,
		const DataLayout &DL) {
switch (Inst->getOpcode()) {		switch (Inst->getOpcode()) {
case Instruction::Load: {		case Instruction::Load: {
// Currently only handle the case where the Pointer Operand is a GEP.		// Currently only handle the case where the Pointer Operand is a GEP.
// Also we could not vectorize volatile or atomic loads.		// Also we could not vectorize volatile or atomic loads.
LoadInst *LI = cast<LoadInst>(Inst);		LoadInst *LI = cast<LoadInst>(Inst);
if (isa<AllocaInst>(User) &&		if (isa<AllocaInst>(User) &&
LI->getPointerOperandType() == User->getType() &&		LI->getPointerOperandType() == User->getType() &&
isa<VectorType>(LI->getType()))		isa<VectorType>(LI->getType()))
return true;		return true;
return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
		Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
		if (!PtrInst)
		return false;

		return (PtrInst->getOpcode() == Instruction::GetElementPtr \|\|
		PtrInst->getOpcode() == Instruction::BitCast) &&
		LI->isSimple();
}		}
case Instruction::BitCast:		case Instruction::BitCast:
return true;		return true;
case Instruction::Store: {		case Instruction::Store: {
// Must be the stored pointer operand, not a stored value, plus		// Must be the stored pointer operand, not a stored value, plus
// since it should be canonical form, the User should be a GEP.		// since it should be canonical form, the User should be a GEP.
// Also we could not vectorize volatile or atomic stores.		// Also we could not vectorize volatile or atomic stores.
StoreInst *SI = cast<StoreInst>(Inst);		StoreInst *SI = cast<StoreInst>(Inst);
if (isa<AllocaInst>(User) &&		if (isa<AllocaInst>(User) &&
SI->getPointerOperandType() == User->getType() &&		SI->getPointerOperandType() == User->getType() &&
isa<VectorType>(SI->getValueOperand()->getType()))		isa<VectorType>(SI->getValueOperand()->getType()))
return true;		return true;
return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
		Instruction *UserInst = dyn_cast<Instruction>(User);
		if (!UserInst)
		return false;

		return (SI->getPointerOperand() == User) &&
		(UserInst->getOpcode() == Instruction::GetElementPtr \|\|
		UserInst->getOpcode() == Instruction::BitCast) &&
		SI->isSimple();
}		}
default:		default:
return false;		return false;
}		}
}		}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {		static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {

if (DisablePromoteAllocaToVector) {		if (DisablePromoteAllocaToVector) {
LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");		LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
return false;		return false;
}		}

Type *AllocaTy = Alloca->getAllocatedType();		Type *AllocaTy = Alloca->getAllocatedType();
VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);		VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
Show All 11 Lines	static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
// could also be promoted but we don't currently handle this case		// could also be promoted but we don't currently handle this case
if (!VectorTy \|\| VectorTy->getNumElements() > 16 \|\|		if (!VectorTy \|\| VectorTy->getNumElements() > 16 \|\|
VectorTy->getNumElements() < 2) {		VectorTy->getNumElements() < 2) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");		LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;		return false;
}		}

std::map<GetElementPtrInst, Value> GEPVectorIdx;		std::map<GetElementPtrInst, Value> GEPVectorIdx;
std::vector<Value*> WorkList;		std::vector<Value *> WorkList;
for (User *AllocaUser : Alloca->users()) {		SmallVector<User *, 8> Users(Alloca->users());
		SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
		Type *VecEltTy = VectorTy->getElementType();
		while (!Users.empty()) {
		User *AllocaUser = Users.pop_back_val();
		User *UseUser = UseUsers.pop_back_val();
		Instruction *Inst = dyn_cast<Instruction>(AllocaUser);

GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);		GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
if (!GEP) {		if (!GEP) {
if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))		if (!canVectorizeInst(Inst, UseUser, DL))
return false;		return false;

		if (Inst->getOpcode() == Instruction::BitCast) {
		Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
		Type *ToTy = Inst->getType()->getPointerElementType();
		if (FromTy->isAggregateType() \|\| ToTy->isAggregateType() \|\|
		DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
		continue;

		for (User *CastUser : Inst->users()) {
		if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
		arsenmUnsubmitted Done Reply Inline Actions Needs test with assume intrinsic arsenm: Needs test with assume intrinsic
		rampitecAuthorUnsubmitted Done Reply Inline Actions It's the same as lifetime intrinsics in the test, but I will add one. Just note that a pointer cannot be passed into assume, it shall be a compare, so it will prevent alloca removal. rampitec: It's the same as lifetime intrinsics in the test, but I will add one. Just note that a pointer…
		continue;
		Users.push_back(CastUser);
		UseUsers.push_back(Inst);
		}

		continue;
		}

WorkList.push_back(AllocaUser);		WorkList.push_back(AllocaUser);
continue;		continue;
}		}

Value *Index = GEPToVectorIndex(GEP);		Value *Index = GEPToVectorIndex(GEP);

// If we can't compute a vector index from this GEP, then we can't		// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.		// promote this alloca to vector.
if (!Index) {		if (!Index) {
LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP		LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
<< '\n');		<< '\n');
return false;		return false;
}		}

GEPVectorIdx[GEP] = Index;		GEPVectorIdx[GEP] = Index;
for (User *GEPUser : AllocaUser->users()) {		Users.append(GEP->user_begin(), GEP->user_end());
if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))		UseUsers.append(GEP->getNumUses(), GEP);
return false;

WorkList.push_back(GEPUser);
}
}		}

LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "		LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');		<< *VectorTy << '\n');

for (Value *V : WorkList) {		for (Value *V : WorkList) {
Instruction *Inst = cast<Instruction>(V);		Instruction *Inst = cast<Instruction>(V);
IRBuilder<> Builder(Inst);		IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {		switch (Inst->getOpcode()) {
case Instruction::Load: {		case Instruction::Load: {
if (Inst->getType() == AllocaTy)		if (Inst->getType() == AllocaTy)
break;		break;

Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);		Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();		Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);		Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);

Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);		Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);		Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);		Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
		if (Inst->getType() != VecEltTy)
		arsenmUnsubmitted Done Reply Inline Actions IRBuilder does this check for you so you can omit it arsenm: IRBuilder does this check for you so you can omit it
		rampitecAuthorUnsubmitted Done Reply Inline Actions Actually it does not: Assertion `New->getType() == getType() && "replaceAllUses of value with new value of different type!"' failed. rampitec: Actually it does not: Assertion `New->getType() == getType() && "replaceAllUses of value with…
		ExtractElement = Builder.CreateBitCast(ExtractElement, Inst->getType());
Inst->replaceAllUsesWith(ExtractElement);		Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();		Inst->eraseFromParent();
break;		break;
}		}
case Instruction::Store: {		case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(Inst);		StoreInst *SI = cast<StoreInst>(Inst);
if (SI->getValueOperand()->getType() == AllocaTy)		if (SI->getValueOperand()->getType() == AllocaTy)
break;		break;

Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);		Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = SI->getPointerOperand();		Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);		Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);		Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);		Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,		Value *Elt = SI->getValueOperand();
SI->getValueOperand(),		if (Elt->getType() != VecEltTy)
Index);		Elt = Builder.CreateBitCast(Elt, VecEltTy);
		Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
Builder.CreateStore(NewVecValue, BitCast);		Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();		Inst->eraseFromParent();
break;		break;
}		}
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
break;

default:		default:
llvm_unreachable("Inconsistency in instructions promotable to vector");		llvm_unreachable("Inconsistency in instructions promotable to vector");
}		}
}		}
return true;		return true;
}		}

▲ Show 20 Lines • Show All 248 Lines • ▼ Show 20 Lines

// FIXME: Should try to pick the most likely to be profitable allocas first.		// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {		bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// Array allocations are probably not worth handling, since an allocation of		// Array allocations are probably not worth handling, since an allocation of
// the array type is the canonical form.		// the array type is the canonical form.
if (!I.isStaticAlloca() \|\| I.isArrayAllocation())		if (!I.isStaticAlloca() \|\| I.isArrayAllocation())
return false;		return false;

		const DataLayout &DL = Mod->getDataLayout();
IRBuilder<> Builder(&I);		IRBuilder<> Builder(&I);

// First try to replace the alloca with a vector		// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();		Type *AllocaTy = I.getAllocatedType();

LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');		LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

if (tryPromoteAllocaToVector(&I))		if (tryPromoteAllocaToVector(&I, DL))
return true; // Promoted to vector.		return true; // Promoted to vector.

if (DisablePromoteAllocaToLDS)		if (DisablePromoteAllocaToLDS)
return false;		return false;

const Function &ContainingFunction = *I.getParent()->getParent();		const Function &ContainingFunction = *I.getParent()->getParent();
CallingConv::ID CC = ContainingFunction.getCallingConv();		CallingConv::ID CC = ContainingFunction.getCallingConv();

Show All 13 Lines	bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {

// Not likely to have sufficient local memory for promotion.		// Not likely to have sufficient local memory for promotion.
if (!SufficientLDS)		if (!SufficientLDS)
return false;		return false;

const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);		const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;		unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

const DataLayout &DL = Mod->getDataLayout();

unsigned Align = I.getAlignment();		unsigned Align = I.getAlignment();
if (Align == 0)		if (Align == 0)
Align = DL.getABITypeAlignment(I.getAllocatedType());		Align = DL.getABITypeAlignment(I.getAllocatedType());

// FIXME: This computed padding is likely wrong since it depends on inverse		// FIXME: This computed padding is likely wrong since it depends on inverse
// usage order.		// usage order.
//		//
// FIXME: It is also possible that if we're allowed to use all of the memory		// FIXME: It is also possible that if we're allowed to use all of the memory
▲ Show 20 Lines • Show All 172 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll

This file was added.

				; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s \| FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s \| FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
				; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s \| FileCheck -check-prefix=OPT %s

				target datalayout = "A5"

				; OPT-LABEL: @vector_read_alloca_bitcast(
				; OPT-NOT: alloca
				; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
				; OPT-NEXT: store i32 %0, i32 addrspace(1)* %out, align 4

				; GCN-LABEL: {{^}}vector_read_alloca_bitcast:
				; GCN-ALLOCA-COUNT-4: buffer_store_dword
				; GCN-ALLOCA: buffer_load_dword

				; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1
				; GCN-PROMOTE: v_cndmask_b32_e{{32\|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
				; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC2:[^,]+]], s{{[0-9]+}}, 2
				; GCN-PROMOTE: v_cndmask_b32_e{{32\|64}} [[IND2:v[0-9]+]], 2, [[IND1]], [[CC2]]
				; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC3:[^,]+]], s{{[0-9]+}}, 3
				; GCN-PROMOTE: v_cndmask_b32_e{{32\|64}} [[IND3:v[0-9]+]], 3, [[IND2]], [[CC3]]

				; GCN-PROMOTE: ScratchSize: 0

				define amdgpu_kernel void @vector_read_alloca_bitcast(i32 addrspace(1)* %out, i32 %index) {
				entry:
				%tmp = alloca [4 x i32], addrspace(5)
				%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
				arsenmUnsubmitted Done Reply Inline Actions Needs tests with multiple uses arsenm: Needs tests with multiple uses
				%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
				%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
				%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
				store i32 0, i32 addrspace(5)* %x
				store i32 1, i32 addrspace(5)* %y
				store i32 2, i32 addrspace(5)* %z
				store i32 3, i32 addrspace(5)* %w
				%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i32, i32 addrspace(5)* %tmp1
				store i32 %tmp2, i32 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @vector_write_alloca_bitcast(
				; OPT-NOT: alloca
				; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
				; OPT-NEXT: %1 = extractelement <4 x i32> %0, i32 %r_index
				; OPT-NEXT: store i32 %1, i32 addrspace(1)* %out, align

				; GCN-LABEL: {{^}}vector_write_alloca_bitcast:
				; GCN-ALLOCA-COUNT-5: buffer_store_dword
				; GCN-ALLOCA: buffer_load_dword

				; GCN-PROMOTE-COUNT-7: v_cndmask

				; GCN-PROMOTE: ScratchSize: 0

				define amdgpu_kernel void @vector_write_alloca_bitcast(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
				entry:
				%tmp = alloca [4 x i32], addrspace(5)
				%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
				%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
				%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
				%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
				store i32 0, i32 addrspace(5)* %x
				store i32 0, i32 addrspace(5)* %y
				store i32 0, i32 addrspace(5)* %z
				store i32 0, i32 addrspace(5)* %w
				%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
				store i32 1, i32 addrspace(5)* %tmp1
				%tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
				%tmp3 = load i32, i32 addrspace(5)* %tmp2
				store i32 %tmp3, i32 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @vector_write_read_bitcast_to_float(
				; OPT-NOT: alloca
				; OPT: bb2:
				; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
				; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp73, i32 %tmp10
				; OPT: .preheader:
				; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
				; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20

				; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
				; GCN-ALLOCA: buffer_store_dword

				; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
				; GCN-PROMOTE-COUNT-6: v_cndmask

				; GCN: s_cbranch

				; GCN-ALLOCA: buffer_load_dword

				; GCN-PROMOTE: v_cmp_eq_u16
				; GCN-PROMOTE: v_cndmask
				; GCN-PROMOTE: v_cmp_eq_u16
				; GCN-PROMOTE: v_cndmask
				; GCN-PROMOTE: v_cmp_eq_u16
				; GCN-PROMOTE: v_cndmask
				; GCN-PROMOTE: v_cmp_eq_u16
				; GCN-PROMOTE: v_cndmask
				; GCN-PROMOTE: v_cmp_eq_u16
				; GCN-PROMOTE: v_cndmask

				; GCN-PROMOTE: ScratchSize: 0

				define amdgpu_kernel void @vector_write_read_bitcast_to_float(float addrspace(1)* %arg) {
				bb:
				%tmp = alloca [6 x float], align 4, addrspace(5)
				%tmp1 = bitcast [6 x float] addrspace(5)* %tmp to i8 addrspace(5)*
				call void @llvm.lifetime.start.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
				br label %bb2

				bb2: ; preds = %bb2, %bb
				%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
				%tmp4 = zext i32 %tmp3 to i64
				%tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp4
				%tmp6 = bitcast float addrspace(1)* %tmp5 to i32 addrspace(1)*
				%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				%tmp8 = trunc i32 %tmp3 to i16
				%tmp9 = urem i16 %tmp8, 6
				%tmp10 = zext i16 %tmp9 to i32
				%tmp11 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp10
				%tmp12 = bitcast float addrspace(5)* %tmp11 to i32 addrspace(5)*
				store i32 %tmp7, i32 addrspace(5)* %tmp12, align 4
				%tmp13 = add nuw nsw i32 %tmp3, 1
				%tmp14 = icmp eq i32 %tmp13, 1000
				br i1 %tmp14, label %.preheader, label %bb2

				bb15: ; preds = %.preheader
				call void @llvm.lifetime.end.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
				ret void

				.preheader: ; preds = %.preheader, %bb2
				%tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ]
				%tmp17 = trunc i32 %tmp16 to i16
				%tmp18 = urem i16 %tmp17, 6
				%tmp19 = sub nuw nsw i16 5, %tmp18
				%tmp20 = zext i16 %tmp19 to i32
				%tmp21 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp20
				%tmp22 = bitcast float addrspace(5)* %tmp21 to i32 addrspace(5)*
				%tmp23 = load i32, i32 addrspace(5)* %tmp22, align 4
				%tmp24 = zext i32 %tmp16 to i64
				%tmp25 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp24
				%tmp26 = bitcast float addrspace(1)* %tmp25 to i32 addrspace(1)*
				store i32 %tmp23, i32 addrspace(1)* %tmp26, align 4
				%tmp27 = add nuw nsw i32 %tmp16, 1
				%tmp28 = icmp eq i32 %tmp27, 1000
				br i1 %tmp28, label %bb15, label %.preheader
				}

				; OPT-LABEL: @vector_write_read_bitcast_to_double(
				; OPT-NOT: alloca
				; OPT: bb2:
				; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
				; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp73, i32 %tmp10
				; OPT: .preheader:
				; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
				; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20

				; TODO: Fix selection to eliminate scratch

				; GCN-LABEL: {{^}}vector_write_read_bitcast_to_double:
				; GCN-COUNT-2: buffer_store_dword

				; GCN: s_cbranch

				; GCN-COUNT-2: buffer_load_dword

				define amdgpu_kernel void @vector_write_read_bitcast_to_double(double addrspace(1)* %arg) {
				bb:
				%tmp = alloca [6 x double], align 8, addrspace(5)
				%tmp1 = bitcast [6 x double] addrspace(5)* %tmp to i8 addrspace(5)*
				call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
				br label %bb2

				bb2: ; preds = %bb2, %bb
				%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
				%tmp4 = zext i32 %tmp3 to i64
				%tmp5 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp4
				%tmp6 = bitcast double addrspace(1)* %tmp5 to i64 addrspace(1)*
				%tmp7 = load i64, i64 addrspace(1)* %tmp6, align 8
				%tmp8 = trunc i32 %tmp3 to i16
				%tmp9 = urem i16 %tmp8, 6
				%tmp10 = zext i16 %tmp9 to i32
				%tmp11 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp10
				%tmp12 = bitcast double addrspace(5)* %tmp11 to i64 addrspace(5)*
				store i64 %tmp7, i64 addrspace(5)* %tmp12, align 8
				%tmp13 = add nuw nsw i32 %tmp3, 1
				%tmp14 = icmp eq i32 %tmp13, 1000
				br i1 %tmp14, label %.preheader, label %bb2

				bb15: ; preds = %.preheader
				call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
				ret void

				.preheader: ; preds = %.preheader, %bb2
				%tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ]
				%tmp17 = trunc i32 %tmp16 to i16
				%tmp18 = urem i16 %tmp17, 6
				%tmp19 = sub nuw nsw i16 5, %tmp18
				%tmp20 = zext i16 %tmp19 to i32
				%tmp21 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp20
				%tmp22 = bitcast double addrspace(5)* %tmp21 to i64 addrspace(5)*
				%tmp23 = load i64, i64 addrspace(5)* %tmp22, align 8
				%tmp24 = zext i32 %tmp16 to i64
				%tmp25 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp24
				%tmp26 = bitcast double addrspace(1)* %tmp25 to i64 addrspace(1)*
				store i64 %tmp23, i64 addrspace(1)* %tmp26, align 8
				%tmp27 = add nuw nsw i32 %tmp16, 1
				%tmp28 = icmp eq i32 %tmp27, 1000
				br i1 %tmp28, label %bb15, label %.preheader
				}

				; OPT-LABEL: @vector_write_read_bitcast_to_i64(
				; OPT-NOT: alloca
				; OPT: bb2:
				; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
				; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
				; OPT: .preheader:
				; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18

				; TODO: Fix selection to eliminate scratch

				; GCN-LABEL: {{^}}vector_write_read_bitcast_to_i64:
				; GCN-COUNT-2: buffer_store_dword

				; GCN: s_cbranch

				; GCN-COUNT-2: buffer_load_dword

				define amdgpu_kernel void @vector_write_read_bitcast_to_i64(i64 addrspace(1)* %arg) {
				bb:
				%tmp = alloca [6 x i64], align 8, addrspace(5)
				%tmp1 = bitcast [6 x i64] addrspace(5)* %tmp to i8 addrspace(5)*
				call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
				br label %bb2

				bb2: ; preds = %bb2, %bb
				%tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ]
				%tmp4 = zext i32 %tmp3 to i64
				%tmp5 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp4
				%tmp6 = load i64, i64 addrspace(1)* %tmp5, align 8
				%tmp7 = trunc i32 %tmp3 to i16
				%tmp8 = urem i16 %tmp7, 6
				%tmp9 = zext i16 %tmp8 to i32
				%tmp10 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp9
				store i64 %tmp6, i64 addrspace(5)* %tmp10, align 8
				%tmp11 = add nuw nsw i32 %tmp3, 1
				%tmp12 = icmp eq i32 %tmp11, 1000
				br i1 %tmp12, label %.preheader, label %bb2

				bb13: ; preds = %.preheader
				call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
				ret void

				.preheader: ; preds = %.preheader, %bb2
				%tmp14 = phi i32 [ %tmp23, %.preheader ], [ 0, %bb2 ]
				%tmp15 = trunc i32 %tmp14 to i16
				%tmp16 = urem i16 %tmp15, 6
				%tmp17 = sub nuw nsw i16 5, %tmp16
				%tmp18 = zext i16 %tmp17 to i32
				%tmp19 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp18
				%tmp20 = load i64, i64 addrspace(5)* %tmp19, align 8
				%tmp21 = zext i32 %tmp14 to i64
				%tmp22 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp21
				store i64 %tmp20, i64 addrspace(1)* %tmp22, align 8
				%tmp23 = add nuw nsw i32 %tmp14, 1
				%tmp24 = icmp eq i32 %tmp23, 1000
				br i1 %tmp24, label %bb13, label %.preheader
				}

				declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)

				declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)