Diff 338904

llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp

	Show First 20 Lines • Show All 486 Lines • ▼ Show 20 Lines
	}			}

	MachineInstrBuilder MachineIRBuilder::buildZExtInReg(const DstOp &Res,			MachineInstrBuilder MachineIRBuilder::buildZExtInReg(const DstOp &Res,
	const SrcOp &Op,			const SrcOp &Op,
	int64_t ImmOp) {			int64_t ImmOp) {
	LLT ResTy = Res.getLLTTy(*getMRI());			LLT ResTy = Res.getLLTTy(*getMRI());
	auto Mask = buildConstant(			auto Mask = buildConstant(
	ResTy, APInt::getLowBitsSet(ResTy.getScalarSizeInBits(), ImmOp));			ResTy, APInt::getLowBitsSet(ResTy.getScalarSizeInBits(), ImmOp));
	return buildAnd(ResTy, Op, Mask);			return buildAnd(Res, Op, Mask);
				arsenmUnsubmitted Not Done Reply Inline Actions Should split this into a separate patch arsenm: Should split this into a separate patch
	}			}

	MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst,			MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst,
	const SrcOp &Src) {			const SrcOp &Src) {
	LLT SrcTy = Src.getLLTTy(*getMRI());			LLT SrcTy = Src.getLLTTy(*getMRI());
	LLT DstTy = Dst.getLLTTy(*getMRI());			LLT DstTy = Dst.getLLTTy(*getMRI());
	if (SrcTy == DstTy)			if (SrcTy == DstTy)
	return buildCopy(Dst, Src);			return buildCopy(Dst, Src);
	▲ Show 20 Lines • Show All 741 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Show First 20 Lines • Show All 437 Lines • ▼ Show 20 Lines
static bool isScalarLoadLegal(const MachineInstr &MI) {		static bool isScalarLoadLegal(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())		if (!MI.hasOneMemOperand())
return false;		return false;

const MachineMemOperand *MMO = *MI.memoperands_begin();		const MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned AS = MMO->getAddrSpace();		const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||		const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;		AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
		// Require 4-byte alignment.
// There are no extending SMRD/SMEM loads, and they require 4-byte alignment.		return MMO->getAlign() >= Align(4) &&
		arsenmUnsubmitted Not Done Reply Inline Actions There's no point in checking the size anymore arsenm: There's no point in checking the size anymore
		arsenmUnsubmitted Not Done Reply Inline Actions The old comment was more informative arsenm: The old comment was more informative
return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
// Can't do a scalar atomic load.		// Can't do a scalar atomic load.
!MMO->isAtomic() &&		!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address		// Don't use scalar loads for volatile accesses to non-constant address
// spaces.		// spaces.
(IsConst || !MMO->isVolatile()) &&		(IsConst || !MMO->isVolatile()) &&
// Memory must be known constant, or not written before this load.		// Memory must be known constant, or not written before this load.
(IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&		(IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
AMDGPUInstrInfo::isUniformMMO(MMO);		AMDGPUInstrInfo::isUniformMMO(MMO);
▲ Show 20 Lines • Show All 686 Lines • ▼ Show 20 Lines	bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();		Register DstReg = MI.getOperand(0).getReg();
const LLT LoadTy = MRI.getType(DstReg);		const LLT LoadTy = MRI.getType(DstReg);
unsigned LoadSize = LoadTy.getSizeInBits();		unsigned LoadSize = LoadTy.getSizeInBits();
const unsigned MaxNonSmrdLoadSize = 128;		const unsigned MaxNonSmrdLoadSize = 128;

const RegisterBank *PtrBank =		const RegisterBank *PtrBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;		OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
if (PtrBank == &AMDGPU::SGPRRegBank) {		if (PtrBank == &AMDGPU::SGPRRegBank) {
// If the pointer is an SGPR, we ordinarily have nothing to do.		// There are some special cases that we need to look at for 32 bit and 96
if (LoadSize != 96)		// bit SGPR loads otherwise we have nothing to do.
		if (LoadSize != 32 && LoadSize != 96)
return false;		return false;

MachineMemOperand *MMO = *MI.memoperands_begin();		MachineMemOperand *MMO = *MI.memoperands_begin();
		const unsigned MemSize = 8 * MMO->getSize();
		// Scalar loads of size 8 or 16 bit with proper alignment may be widened to 32
		arsenmUnsubmitted Not Done Reply Inline Actions Grammar, widen->widened arsenm: Grammar, widen->widened
		// bit. Check to see if we need to widen the memory access, 8 or 16 bit
		arsenmUnsubmitted Not Done Reply Inline Actions I don't understand this getSize check. It's illegal for the memory size to exceed the loaded type arsenm: I don't understand this getSize check. It's illegal for the memory size to exceed the loaded…
		// scalar loads should have a load size of 32 but memory access size of less
		arsenmUnsubmitted Not Done Reply Inline Actions You could also handle G_SEXTLOAD and G_ZEXTLOAD, but would require inserting some instructions to set the high bits appropriately. This also makes me think this should be done earlier arsenm: You could also handle G_SEXTLOAD and G_ZEXTLOAD, but would require inserting some instructions…
		// than 32.
		if (LoadSize == 32 &&
		(MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
		return false;

Register PtrReg = MI.getOperand(1).getReg();		Register PtrReg = MI.getOperand(1).getReg();
// 96-bit loads are only available for vector loads. We need to split this
// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).

ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);		ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
MachineIRBuilder B(MI, O);		MachineIRBuilder B(MI, O);

		if (LoadSize == 32) {
		// The result type is already 32 bits, but the memory access is still
		// narrower than that, so widen the memory access to 32 bits here.
		arsenmUnsubmitted Not Done Reply Inline Actions This comment is worded confusingly so it doesn't explain what is going on. We are widening the memory access, not the result register type arsenm: This comment is worded confusingly so it doesn't explain what is going on. We are widening the…
		if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
		// Must extend the sign bit into higher bits for a G_SEXTLOAD
		const LLT S32 = LLT::scalar(32);
		arsenmUnsubmitted Not Done Reply Inline Actions Should just hoist the S32 definition instead of repeating it arsenm: Should just hoist the S32 definition instead of repeating it
		auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
		B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
		} else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
		// Must extend zero into higher bits with an AND for a G_ZEXTLOAD
		const LLT S32 = LLT::scalar(32);
		auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
		B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
		foadUnsubmitted Not Done Reply Inline Actions Use B.buildZExtInReg. foad: Use B.buildZExtInReg.
		vangthaoAuthorUnsubmitted Done Reply Inline Actions I attempted to use B.buildZExtInReg but this function creates a new virtual register as the destination instead of using MI.getOperand(0)'s register when passing it as the first argument. This is unlike buildSExtInReg which uses the first argument as the destination. vangthao: I attempted to use B.buildZExtInReg but this function creates a new virtual register as the…
		foadUnsubmitted Not Done Reply Inline Actions Ah, that's a bug in the implementation: --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -492,7 +492,7 @@ MachineInstrBuilder MachineIRBuilder::buildZExtInReg(const DstOp &Res, LLT ResTy = Res.getLLTTy(getMRI()); auto Mask = buildConstant( ResTy, APInt::getLowBitsSet(ResTy.getScalarSizeInBits(), ImmOp)); - return buildAnd(ResTy, Op, Mask); + return buildAnd(Res, Op, Mask); } MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst, foad:* Ah, that's a bug in the implementation: ``` --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.
		arsenmUnsubmitted Not Done Reply Inline Actions S32.getScalarSizeInBits() is a complicated way of just saying 32 arsenm: S32.getScalarSizeInBits() is a complicated way of just saying 32
		arsenmUnsubmitted Not Done Reply Inline Actions MachineIRBuilder has a buildZExtInReg helper to figure out the mask and create the and for you arsenm: MachineIRBuilder has a buildZExtInReg helper to figure out the mask and create the and for you
		vangthaoAuthorUnsubmitted Done Reply Inline Actions I used this previously but there was an issue with the implementation in buildZExtInReg where it would create a new destination register instead of using the original destination register passed to it. Jay mentioned this bug in a previous comment. vangthao: I used this previously but there was an issue with the implementation in buildZExtInReg where…
		arsenmUnsubmitted Not Done Reply Inline Actions Yes, buildZExtInReg is broken and needs to be fixed. You should fix it and continue using it arsenm: Yes, buildZExtInReg is broken and needs to be fixed. You should fix it and continue using it
		} else
		// We do not need to touch the higher bits for regular loads.
		B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
		} else {
		// 96-bit loads are only available for vector loads. We need to split this
		arsenmUnsubmitted Not Done Reply Inline Actions Needs to elaborate more on when this is valid arsenm: Needs to elaborate more on when this is valid
		// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
if (MMO->getAlign() < Align(16)) {		if (MMO->getAlign() < Align(16)) {
LLT Part64, Part32;		LLT Part64, Part32;
std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);		std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);		auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);		auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

auto Undef = B.buildUndef(LoadTy);		auto Undef = B.buildUndef(LoadTy);
auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);		auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);		B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
} else {		} else {
LLT WiderTy = widen96To128(LoadTy);		LLT WiderTy = widen96To128(LoadTy);
auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);		auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
B.buildExtract(MI.getOperand(0), WideLoad, 0);		B.buildExtract(MI.getOperand(0), WideLoad, 0);
}		}
		arsenmUnsubmitted Not Done Reply Inline Actions Should use trunc. I'm trying to eliminate G_EXTRACT usage arsenm: Should use trunc. I'm trying to eliminate G_EXTRACT usage
		}

MI.eraseFromParent();		MI.eraseFromParent();
return true;		return true;
}		}

// 128-bit loads are supported for all instruction types.		// 128-bit loads are supported for all instruction types.
if (LoadSize <= MaxNonSmrdLoadSize)		if (LoadSize <= MaxNonSmrdLoadSize)
return false;		return false;
▲ Show 20 Lines • Show All 3,191 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck --check-prefix=CI %s			; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck --check-prefix=CI %s
	; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck --check-prefix=VI %s			; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck --check-prefix=VI %s

	define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {			define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {
	; CI-LABEL: frem_f16:			; CI-LABEL: frem_f16:
	; CI: ; %bb.0:			; CI: ; %bb.0:
	; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd			; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
	; CI-NEXT: s_mov_b32 s10, -1			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b32 s11, 0xf000			; CI-NEXT: s_load_dword s0, s[6:7], 0x0
	; CI-NEXT: s_mov_b64 s[2:3], s[10:11]			; CI-NEXT: s_load_dword s1, s[8:9], 0x2
	; CI-NEXT: s_waitcnt lgkmcnt(0)			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b64 s[8:9], s[6:7]			; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0			; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
	; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
	; CI-NEXT: s_waitcnt vmcnt(1)
	; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
	; CI-NEXT: s_waitcnt vmcnt(0)
	; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
	; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0			; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
	; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0			; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
	; CI-NEXT: v_rcp_f32_e32 v4, v2			; CI-NEXT: v_rcp_f32_e32 v4, v2
	; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3			; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
	; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0			; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
	; CI-NEXT: v_fma_f32 v4, v5, v4, v4			; CI-NEXT: v_fma_f32 v4, v5, v4, v4
	; CI-NEXT: v_mul_f32_e32 v5, v3, v4			; CI-NEXT: v_mul_f32_e32 v5, v3, v4
	; CI-NEXT: v_fma_f32 v6, -v2, v5, v3			; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
	; CI-NEXT: v_fma_f32 v5, v6, v4, v5			; CI-NEXT: v_fma_f32 v5, v6, v4, v5
	; CI-NEXT: v_fma_f32 v2, -v2, v5, v3			; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
	; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0			; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
	; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5			; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
	; CI-NEXT: s_mov_b64 s[6:7], s[10:11]			; CI-NEXT: s_mov_b32 s6, -1
				; CI-NEXT: s_mov_b32 s7, 0xf000
	; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0			; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
	; CI-NEXT: v_trunc_f32_e32 v2, v2			; CI-NEXT: v_trunc_f32_e32 v2, v2
	; CI-NEXT: v_fma_f32 v0, -v2, v1, v0			; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
	; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0			; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
	; CI-NEXT: v_cvt_f16_f32_e32 v0, v0			; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; CI-NEXT: buffer_store_short v0, off, s[4:7], 0			; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
	; CI-NEXT: s_endpgm			; CI-NEXT: s_endpgm
	;			;
	; VI-LABEL: frem_f16:			; VI-LABEL: frem_f16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v0, s6			; VI-NEXT: s_load_dword s0, s[6:7], 0x0
	; VI-NEXT: s_add_u32 s0, s8, 8			; VI-NEXT: s_load_dword s1, s[8:9], 0x8
	; VI-NEXT: v_mov_b32_e32 v1, s7			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: s_addc_u32 s1, s9, 0			; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; VI-NEXT: flat_load_ushort v2, v[0:1]			; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
	; VI-NEXT: v_mov_b32_e32 v0, s0
	; VI-NEXT: v_mov_b32_e32 v1, s1			; VI-NEXT: v_mov_b32_e32 v1, s1
	; VI-NEXT: flat_load_ushort v0, v[0:1]			; VI-NEXT: v_rcp_f32_e32 v2, v2
	; VI-NEXT: s_waitcnt vmcnt(1)			; VI-NEXT: v_mul_f32_e32 v0, v0, v2
	; VI-NEXT: v_cvt_f32_f16_e32 v1, v2			; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; VI-NEXT: s_waitcnt vmcnt(0)			; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
	; VI-NEXT: v_cvt_f32_f16_e32 v3, v0			; VI-NEXT: v_trunc_f16_e32 v0, v0
	; VI-NEXT: v_rcp_f32_e32 v3, v3			; VI-NEXT: v_fma_f16 v2, -v0, v1, s0
	; VI-NEXT: v_mul_f32_e32 v1, v1, v3
	; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
	; VI-NEXT: v_div_fixup_f16 v1, v1, v0, v2
	; VI-NEXT: v_trunc_f16_e32 v1, v1
	; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: flat_store_short v[0:1], v2			; VI-NEXT: flat_store_short v[0:1], v2
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4			%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
	%r0 = load half, half addrspace(1)* %in1, align 4			%r0 = load half, half addrspace(1)* %in1, align 4
	%r1 = load half, half addrspace(1)* %gep2, align 4			%r1 = load half, half addrspace(1)* %gep2, align 4
	%r2 = frem half %r0, %r1			%r2 = frem half %r0, %r1
	store half %r2, half addrspace(1)* %out, align 4			store half %r2, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {			define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {
	; CI-LABEL: fast_frem_f16:			; CI-LABEL: fast_frem_f16:
	; CI: ; %bb.0:			; CI: ; %bb.0:
	; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd			; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
	; CI-NEXT: s_mov_b32 s10, -1			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b32 s11, 0xf000			; CI-NEXT: s_load_dword s0, s[6:7], 0x0
	; CI-NEXT: s_mov_b64 s[2:3], s[10:11]			; CI-NEXT: s_load_dword s1, s[8:9], 0x2
	; CI-NEXT: s_waitcnt lgkmcnt(0)			; CI-NEXT: s_mov_b32 s6, -1
	; CI-NEXT: s_mov_b64 s[8:9], s[6:7]			; CI-NEXT: s_mov_b32 s7, 0xf000
	; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0			; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; CI-NEXT: s_mov_b64 s[6:7], s[10:11]			; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
	; CI-NEXT: s_waitcnt vmcnt(1)
	; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
	; CI-NEXT: s_waitcnt vmcnt(0)
	; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
	; CI-NEXT: v_rcp_f32_e32 v2, v1			; CI-NEXT: v_rcp_f32_e32 v2, v1
	; CI-NEXT: v_mul_f32_e32 v2, v0, v2			; CI-NEXT: v_mul_f32_e32 v2, v0, v2
	; CI-NEXT: v_trunc_f32_e32 v2, v2			; CI-NEXT: v_trunc_f32_e32 v2, v2
	; CI-NEXT: v_fma_f32 v0, -v2, v1, v0			; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
	; CI-NEXT: v_cvt_f16_f32_e32 v0, v0			; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; CI-NEXT: buffer_store_short v0, off, s[4:7], 0			; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
	; CI-NEXT: s_endpgm			; CI-NEXT: s_endpgm
	;			;
	; VI-LABEL: fast_frem_f16:			; VI-LABEL: fast_frem_f16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v0, s6			; VI-NEXT: s_load_dword s0, s[6:7], 0x0
	; VI-NEXT: s_add_u32 s0, s8, 8			; VI-NEXT: s_load_dword s1, s[8:9], 0x8
	; VI-NEXT: v_mov_b32_e32 v1, s7			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: s_addc_u32 s1, s9, 0			; VI-NEXT: v_mov_b32_e32 v1, s0
	; VI-NEXT: flat_load_ushort v2, v[0:1]			; VI-NEXT: v_rcp_f16_e32 v0, s1
	; VI-NEXT: v_mov_b32_e32 v0, s0			; VI-NEXT: v_mul_f16_e32 v0, s0, v0
	; VI-NEXT: v_mov_b32_e32 v1, s1			; VI-NEXT: v_trunc_f16_e32 v0, v0
	; VI-NEXT: flat_load_ushort v0, v[0:1]			; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
	; VI-NEXT: s_waitcnt vmcnt(0)
	; VI-NEXT: v_rcp_f16_e32 v1, v0
	; VI-NEXT: v_mul_f16_e32 v1, v2, v1
	; VI-NEXT: v_trunc_f16_e32 v1, v1
	; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: flat_store_short v[0:1], v2			; VI-NEXT: flat_store_short v[0:1], v2
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4			%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
	%r0 = load half, half addrspace(1)* %in1, align 4			%r0 = load half, half addrspace(1)* %in1, align 4
	%r1 = load half, half addrspace(1)* %gep2, align 4			%r1 = load half, half addrspace(1)* %gep2, align 4
	%r2 = frem fast half %r0, %r1			%r2 = frem fast half %r0, %r1
	store half %r2, half addrspace(1)* %out, align 4			store half %r2, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #1 {			define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #1 {
	; CI-LABEL: unsafe_frem_f16:			; CI-LABEL: unsafe_frem_f16:
	; CI: ; %bb.0:			; CI: ; %bb.0:
	; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd			; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
	; CI-NEXT: s_mov_b32 s10, -1			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b32 s11, 0xf000			; CI-NEXT: s_load_dword s0, s[6:7], 0x0
	; CI-NEXT: s_mov_b64 s[2:3], s[10:11]			; CI-NEXT: s_load_dword s1, s[8:9], 0x2
	; CI-NEXT: s_waitcnt lgkmcnt(0)			; CI-NEXT: s_mov_b32 s6, -1
	; CI-NEXT: s_mov_b64 s[8:9], s[6:7]			; CI-NEXT: s_mov_b32 s7, 0xf000
	; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0			; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; CI-NEXT: s_mov_b64 s[6:7], s[10:11]			; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
	; CI-NEXT: s_waitcnt vmcnt(1)
	; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
	; CI-NEXT: s_waitcnt vmcnt(0)
	; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
	; CI-NEXT: v_rcp_f32_e32 v2, v1			; CI-NEXT: v_rcp_f32_e32 v2, v1
	; CI-NEXT: v_mul_f32_e32 v2, v0, v2			; CI-NEXT: v_mul_f32_e32 v2, v0, v2
	; CI-NEXT: v_trunc_f32_e32 v2, v2			; CI-NEXT: v_trunc_f32_e32 v2, v2
	; CI-NEXT: v_fma_f32 v0, -v2, v1, v0			; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
	; CI-NEXT: v_cvt_f16_f32_e32 v0, v0			; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; CI-NEXT: buffer_store_short v0, off, s[4:7], 0			; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
	; CI-NEXT: s_endpgm			; CI-NEXT: s_endpgm
	;			;
	; VI-LABEL: unsafe_frem_f16:			; VI-LABEL: unsafe_frem_f16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v0, s6			; VI-NEXT: s_load_dword s0, s[6:7], 0x0
	; VI-NEXT: s_add_u32 s0, s8, 8			; VI-NEXT: s_load_dword s1, s[8:9], 0x8
	; VI-NEXT: v_mov_b32_e32 v1, s7			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: s_addc_u32 s1, s9, 0			; VI-NEXT: v_mov_b32_e32 v1, s0
	; VI-NEXT: flat_load_ushort v2, v[0:1]			; VI-NEXT: v_rcp_f16_e32 v0, s1
	; VI-NEXT: v_mov_b32_e32 v0, s0			; VI-NEXT: v_mul_f16_e32 v0, s0, v0
	; VI-NEXT: v_mov_b32_e32 v1, s1			; VI-NEXT: v_trunc_f16_e32 v0, v0
	; VI-NEXT: flat_load_ushort v0, v[0:1]			; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
	; VI-NEXT: s_waitcnt vmcnt(0)
	; VI-NEXT: v_rcp_f16_e32 v1, v0
	; VI-NEXT: v_mul_f16_e32 v1, v2, v1
	; VI-NEXT: v_trunc_f16_e32 v1, v1
	; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: flat_store_short v[0:1], v2			; VI-NEXT: flat_store_short v[0:1], v2
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4			%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
	%r0 = load half, half addrspace(1)* %in1, align 4			%r0 = load half, half addrspace(1)* %in1, align 4
	%r1 = load half, half addrspace(1)* %gep2, align 4			%r1 = load half, half addrspace(1)* %gep2, align 4
	%r2 = frem half %r0, %r1			%r2 = frem half %r0, %r1
	▲ Show 20 Lines • Show All 956 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir

This file was added.

				# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
				# RUN: llc -global-isel -march=amdgcn -mcpu=fiji -run-pass=regbankselect -verify-machineinstrs -o - %s \| FileCheck -check-prefix=GFX8 %s
				# RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -o - %s \| FileCheck -check-prefix=GFX9 %s
				# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect -verify-machineinstrs -o - %s \| FileCheck -check-prefix=GFX10 %s
				---
				name: i8_to_i32_spgr_align8
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: i8_to_i32_spgr_align8
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4)
				; GFX9-LABEL: name: i8_to_i32_spgr_align8
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4)
				; GFX10-LABEL: name: i8_to_i32_spgr_align8
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (invariant load 1, align 8, addrspace 4 )
				...
				---
				name: i8_to_i32_spgr_align4
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: i8_to_i32_spgr_align4
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX9-LABEL: name: i8_to_i32_spgr_align4
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX10-LABEL: name: i8_to_i32_spgr_align4
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (invariant load 1, align 4, addrspace 4 )
				...
				---
				name: i16_to_i32_spgr_align4
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: i16_to_i32_spgr_align4
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX9-LABEL: name: i16_to_i32_spgr_align4
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX10-LABEL: name: i16_to_i32_spgr_align4
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (invariant load 2, align 4, addrspace 4 )
				...
				---
				name: sextload_i8_to_i32_spgr_align4
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: sextload_i8_to_i32_spgr_align4
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX8: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
				; GFX9-LABEL: name: sextload_i8_to_i32_spgr_align4
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX9: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
				; GFX10-LABEL: name: sextload_i8_to_i32_spgr_align4
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX10: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_SEXTLOAD %0 :: (invariant load 1, align 4, addrspace 4 )
				...
				---
				name: sextload_i16_to_i32_spgr_align4
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: sextload_i16_to_i32_spgr_align4
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX8: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
				; GFX9-LABEL: name: sextload_i16_to_i32_spgr_align4
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX9: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
				; GFX10-LABEL: name: sextload_i16_to_i32_spgr_align4
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX10: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_SEXTLOAD %0 :: (invariant load 2, align 4, addrspace 4 )
				...

				---
				name: zextload_i8_to_i32_spgr_align4
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: zextload_i8_to_i32_spgr_align4
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX8: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
				; GFX8: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
				; GFX9-LABEL: name: zextload_i8_to_i32_spgr_align4
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
				; GFX9: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
				; GFX10-LABEL: name: zextload_i8_to_i32_spgr_align4
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
				; GFX10: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 1, align 4, addrspace 4 )
				...
				---
				# Input is a 2-byte invariant G_ZEXTLOAD from addrspace 4 with align 4.
				# Every check prefix expects a 4-byte sgpr G_LOAD whose result is masked
				# with the constant 65535 by a G_AND.
				name: zextload_i16_to_i32_spgr_align4
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: zextload_i16_to_i32_spgr_align4
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX8: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
				; GFX8: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
				; GFX9-LABEL: name: zextload_i16_to_i32_spgr_align4
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
				; GFX9: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
				; GFX10-LABEL: name: zextload_i16_to_i32_spgr_align4
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
				; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
				; GFX10: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 2, align 4, addrspace 4 )
				...
				---
				# Input is a 1-byte invariant G_LOAD from addrspace 4 with only align 2.
				# Every check prefix expects the pointer to be copied to a vgpr and the
				# load to remain a 1-byte G_LOAD on the vgpr bank.
				name: i8_to_i32_vgpr_align2
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: i8_to_i32_vgpr_align2
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				; GFX9-LABEL: name: i8_to_i32_vgpr_align2
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				; GFX10-LABEL: name: i8_to_i32_vgpr_align2
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (invariant load 1, align 2, addrspace 4 )
				...

				---
				# Input is a 2-byte invariant G_LOAD from addrspace 4 with align 2.
				# Every check prefix expects the pointer to be copied to a vgpr and the
				# load to remain a 2-byte G_LOAD on the vgpr bank.
				name: i16_to_i32_vgpr_align2
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: i16_to_i32_vgpr_align2
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				; GFX9-LABEL: name: i16_to_i32_vgpr_align2
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				; GFX10-LABEL: name: i16_to_i32_vgpr_align2
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (invariant load 2, align 2, addrspace 4 )
				...
				---
				# Input is a 1-byte invariant G_SEXTLOAD from addrspace 4 with align 2.
				# Every check prefix expects the pointer to be copied to a vgpr and the
				# instruction to remain a 1-byte G_SEXTLOAD on the vgpr bank.
				name: sext_i8_to_i32_vgpr_align2
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: sext_i8_to_i32_vgpr_align2
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX8: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				; GFX9-LABEL: name: sext_i8_to_i32_vgpr_align2
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX9: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				; GFX10-LABEL: name: sext_i8_to_i32_vgpr_align2
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX10: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_SEXTLOAD %0 :: (invariant load 1, align 2, addrspace 4 )
				...
				---
				# Input is a 2-byte invariant G_SEXTLOAD from addrspace 4 with align 2.
				# Every check prefix expects the pointer to be copied to a vgpr and the
				# instruction to remain a 2-byte G_SEXTLOAD on the vgpr bank.
				name: sext_i16_to_i32_vgpr_align2
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: sext_i16_to_i32_vgpr_align2
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX8: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				; GFX9-LABEL: name: sext_i16_to_i32_vgpr_align2
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX9: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				; GFX10-LABEL: name: sext_i16_to_i32_vgpr_align2
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX10: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_SEXTLOAD %0 :: (invariant load 2, align 2, addrspace 4 )
				...
				---
				arsenm: Should add some dummy uses, e.g. S_ENDPGM 0, implicit %1, just in case regbankselect decides to start dropping dead uses someday.
				# Input is a 1-byte invariant G_ZEXTLOAD from addrspace 4 with align 2.
				# Every check prefix expects the pointer to be copied to a vgpr and the
				# instruction to remain a 1-byte G_ZEXTLOAD on the vgpr bank.
				name: zext_i8_to_i32_vgpr_align2
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: zext_i8_to_i32_vgpr_align2
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX8: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				; GFX9-LABEL: name: zext_i8_to_i32_vgpr_align2
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX9: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				; GFX10-LABEL: name: zext_i8_to_i32_vgpr_align2
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX10: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 1, align 2, addrspace 4 )
				...
				---
				# Input is a 2-byte invariant G_ZEXTLOAD from addrspace 4 with align 2.
				# Every check prefix expects the pointer to be copied to a vgpr and the
				# instruction to remain a 2-byte G_ZEXTLOAD on the vgpr bank.
				name: zext_i16_to_i32_vgpr_align2
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; GFX8-LABEL: name: zext_i16_to_i32_vgpr_align2
				; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX8: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				; GFX9-LABEL: name: zext_i16_to_i32_vgpr_align2
				; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX9: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				; GFX10-LABEL: name: zext_i16_to_i32_vgpr_align2
				; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; GFX10: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 2, align 2, addrspace 4 )
				...
				arsenm: Maybe add a negative case for the memory not being invariant.

llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs -o - %s \| FileCheck -check-prefix=GFX8 %s
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s \| FileCheck -check-prefix=GFX9 %s
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s \| FileCheck -check-prefix=GFX10 %s

				; Loads an i8 from an addrspace(4) pointer with align 4 and stores it to
				; addrspace(1). The assertions expect the value to be fetched with
				; s_load_dword and written back with a byte store.
				define amdgpu_kernel void @constant_load_i8_align4(i8 addrspace (1)* %out, i8 addrspace(4)* %in) #0 {
				; GFX8-LABEL: constant_load_i8_align4:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: v_mov_b32_e32 v2, s2
				; GFX8-NEXT: flat_store_byte v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: constant_load_i8_align4:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: v_mov_b32_e32 v0, s2
				; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: constant_load_i8_align4:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: v_mov_b32_e32 v0, s2
				; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
				; GFX10-NEXT: s_endpgm
				%ld = load i8, i8 addrspace(4)* %in, align 4
				store i8 %ld, i8 addrspace(1)* %out, align 4
				ret void
				}

				; Loads an i16 from an addrspace(4) pointer with align 4 and stores it to
				; addrspace(1). The assertions expect the value to be fetched with
				; s_load_dword and written back with a short store.
				define amdgpu_kernel void @constant_load_i16_align4(i16 addrspace (1)* %out, i16 addrspace(4)* %in) #0 {
				; GFX8-LABEL: constant_load_i16_align4:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: v_mov_b32_e32 v2, s2
				; GFX8-NEXT: flat_store_short v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: constant_load_i16_align4:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: v_mov_b32_e32 v0, s2
				; GFX9-NEXT: global_store_short v1, v0, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: constant_load_i16_align4:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: v_mov_b32_e32 v0, s2
				; GFX10-NEXT: global_store_short v1, v0, s[0:1]
				; GFX10-NEXT: s_endpgm
				%ld = load i16, i16 addrspace(4)* %in, align 4
				store i16 %ld, i16 addrspace(1)* %out, align 4
				ret void
				}

				; Loads an i8 from an addrspace(1) pointer with align 4, sign-extends it
				; to i32 and stores the result. The assertions expect an s_load_dword
				; followed by s_sext_i32_i8 and a dword store.
				define amdgpu_kernel void @sextload_i8_to_i32_align4(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
				; GFX8-LABEL: sextload_i8_to_i32_align4:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_sext_i32_i8 s2, s2
				; GFX8-NEXT: v_mov_b32_e32 v2, s2
				; GFX8-NEXT: flat_store_dword v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: sextload_i8_to_i32_align4:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_sext_i32_i8 s2, s2
				; GFX9-NEXT: v_mov_b32_e32 v0, s2
				; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: sextload_i8_to_i32_align4:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_sext_i32_i8 s2, s2
				; GFX10-NEXT: v_mov_b32_e32 v0, s2
				; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX10-NEXT: s_endpgm
				%load = load i8, i8 addrspace(1)* %in, align 4
				%sext = sext i8 %load to i32
				store i32 %sext, i32 addrspace(1)* %out, align 4
				ret void
				}

				; Loads an i16 from an addrspace(1) pointer with align 4, sign-extends it
				; to i32 and stores the result. The assertions expect an s_load_dword
				; followed by s_sext_i32_i16 and a dword store.
				define amdgpu_kernel void @sextload_i16_to_i32_align4(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
				; GFX8-LABEL: sextload_i16_to_i32_align4:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_sext_i32_i16 s2, s2
				; GFX8-NEXT: v_mov_b32_e32 v2, s2
				; GFX8-NEXT: flat_store_dword v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: sextload_i16_to_i32_align4:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_sext_i32_i16 s2, s2
				; GFX9-NEXT: v_mov_b32_e32 v0, s2
				; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: sextload_i16_to_i32_align4:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_sext_i32_i16 s2, s2
				; GFX10-NEXT: v_mov_b32_e32 v0, s2
				; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX10-NEXT: s_endpgm
				%load = load i16, i16 addrspace(1)* %in, align 4
				%sext = sext i16 %load to i32
				store i32 %sext, i32 addrspace(1)* %out, align 4
				ret void
				}

				; Loads an i8 from an addrspace(1) pointer with align 4, zero-extends it
				; to i32 and stores the result. The assertions expect an s_load_dword
				; followed by s_and_b32 with 0xff and a dword store.
				define amdgpu_kernel void @zextload_i8_to_i32_align4(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
				; GFX8-LABEL: zextload_i8_to_i32_align4:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_and_b32 s2, s2, 0xff
				; GFX8-NEXT: v_mov_b32_e32 v2, s2
				; GFX8-NEXT: flat_store_dword v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: zextload_i8_to_i32_align4:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_and_b32 s2, s2, 0xff
				; GFX9-NEXT: v_mov_b32_e32 v0, s2
				; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: zextload_i8_to_i32_align4:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_and_b32 s2, s2, 0xff
				; GFX10-NEXT: v_mov_b32_e32 v0, s2
				; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX10-NEXT: s_endpgm
				%load = load i8, i8 addrspace(1)* %in, align 4
				%zext = zext i8 %load to i32
				store i32 %zext, i32 addrspace(1)* %out, align 4
				ret void
				}

				; Loads an i16 from an addrspace(1) pointer with align 4, zero-extends it
				; to i32 and stores the result. The assertions expect an s_load_dword
				; followed by s_and_b32 with 0xffff and a dword store.
				define amdgpu_kernel void @zextload_i16_to_i32_align4(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
				; GFX8-LABEL: zextload_i16_to_i32_align4:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
				; GFX8-NEXT: v_mov_b32_e32 v2, s2
				; GFX8-NEXT: flat_store_dword v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: zextload_i16_to_i32_align4:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
				; GFX9-NEXT: v_mov_b32_e32 v0, s2
				; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: zextload_i16_to_i32_align4:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
				; GFX10-NEXT: v_mov_b32_e32 v0, s2
				; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
				; GFX10-NEXT: s_endpgm
				%load = load i16, i16 addrspace(1)* %in, align 4
				%zext = zext i16 %load to i32
				store i32 %zext, i32 addrspace(1)* %out, align 4
				ret void
				}

				; Loads an i8 with align 2 and stores it back. The assertions expect a
				; byte-sized vector memory load (flat_load_ubyte / global_load_ubyte)
				; rather than an s_load_dword.
				; NOTE(review): the name says "constant" but %in is addrspace(1) here;
				; confirm whether addrspace(4) was intended.
				define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
				; GFX8-LABEL: constant_load_i8_align2:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: v_mov_b32_e32 v0, s2
				; GFX8-NEXT: v_mov_b32_e32 v1, s3
				; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt vmcnt(0)
				; GFX8-NEXT: flat_store_byte v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: constant_load_i8_align2:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v0, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: constant_load_i8_align2:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v0, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
				; GFX10-NEXT: s_endpgm
				%load = load i8, i8 addrspace(1)* %in, align 2
				store i8 %load, i8 addrspace(1)* %out, align 2
				ret void
				}

				; Loads an i16 with align 2 and stores it back. The assertions expect a
				; short-sized vector memory load (flat_load_ushort / global_load_ushort)
				; rather than an s_load_dword.
				; NOTE(review): the name says "constant" but %in is addrspace(1) here;
				; confirm whether addrspace(4) was intended.
				define amdgpu_kernel void @constant_load_i16_align2(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
				; GFX8-LABEL: constant_load_i16_align2:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: v_mov_b32_e32 v0, s2
				; GFX8-NEXT: v_mov_b32_e32 v1, s3
				; GFX8-NEXT: flat_load_ushort v2, v[0:1]
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_waitcnt vmcnt(0)
				; GFX8-NEXT: flat_store_short v[0:1], v2
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: constant_load_i16_align2:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v0, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_short v0, v1, s[0:1]
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: constant_load_i16_align2:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v0, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_short v0, v1, s[0:1]
				; GFX10-NEXT: s_endpgm
				%load = load i16, i16 addrspace(1)* %in, align 2
				store i16 %load, i16 addrspace(1)* %out, align 2
				ret void
				}

				; Loads an i8 with align 2, sign-extends it to i32 and stores the i32 with
				; align 2. The assertions expect a signed byte vector memory load
				; (flat_load_sbyte / global_load_sbyte) and the i32 store emitted as two
				; short stores, the second at byte offset 2.
				; NOTE(review): the name says "constant" but %in is addrspace(1) here;
				; confirm whether addrspace(4) was intended.
				define amdgpu_kernel void @constant_sextload_i8_align2(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
				; GFX8-LABEL: constant_sextload_i8_align2:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: v_mov_b32_e32 v0, s2
				; GFX8-NEXT: v_mov_b32_e32 v1, s3
				; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_add_u32 s0, s0, 2
				; GFX8-NEXT: s_addc_u32 s1, s1, 0
				; GFX8-NEXT: s_waitcnt vmcnt(0)
				; GFX8-NEXT: flat_store_short v[0:1], v2
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: flat_store_short v[0:1], v3
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: constant_sextload_i8_align2:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v0, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
				; GFX9-NEXT: global_store_short v0, v1, s[0:1]
				; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: constant_sextload_i8_align2:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v0, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
				; GFX10-NEXT: global_store_short v0, v1, s[0:1]
				; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
				; GFX10-NEXT: s_endpgm
				%load = load i8, i8 addrspace(1)* %in, align 2
				%sextload = sext i8 %load to i32
				store i32 %sextload, i32 addrspace(1)* %out, align 2
				ret void
				}

				; Loads an i8 with align 2, zero-extends it to i32 and stores the i32 with
				; align 2. The assertions expect an unsigned byte vector memory load
				; (flat_load_ubyte / global_load_ubyte) and the i32 store emitted as two
				; short stores, the second at byte offset 2.
				; NOTE(review): the name says "constant" but %in is addrspace(1) here;
				; confirm whether addrspace(4) was intended.
				define amdgpu_kernel void @constant_zextload_i8_align2(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
				; GFX8-LABEL: constant_zextload_i8_align2:
				; GFX8: ; %bb.0:
				; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX8-NEXT: s_waitcnt lgkmcnt(0)
				; GFX8-NEXT: v_mov_b32_e32 v0, s2
				; GFX8-NEXT: v_mov_b32_e32 v1, s3
				; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: s_add_u32 s0, s0, 2
				; GFX8-NEXT: s_addc_u32 s1, s1, 0
				; GFX8-NEXT: s_waitcnt vmcnt(0)
				; GFX8-NEXT: flat_store_short v[0:1], v2
				; GFX8-NEXT: v_mov_b32_e32 v0, s0
				; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
				; GFX8-NEXT: v_mov_b32_e32 v1, s1
				; GFX8-NEXT: flat_store_short v[0:1], v3
				; GFX8-NEXT: s_endpgm
				;
				; GFX9-LABEL: constant_zextload_i8_align2:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: v_mov_b32_e32 v0, 0
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
				; GFX9-NEXT: global_store_short v0, v1, s[0:1]
				; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: constant_zextload_i8_align2:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX10-NEXT: v_mov_b32_e32 v0, 0
				; GFX10-NEXT: s_waitcnt lgkmcnt(0)
				; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
				; GFX10-NEXT: global_store_short v0, v1, s[0:1]
				; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
				; GFX10-NEXT: s_endpgm
				%load = load i8, i8 addrspace(1)* %in, align 2
				%zextload = zext i8 %load to i32
				store i32 %zextload, i32 addrspace(1)* %out, align 2
				ret void
				}

				attributes #0 = { nounwind }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 338904

llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir

llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 338904

llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir

llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

[AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
ClosedPublic