This is an archive of the discontinued LLVM Phabricator instance.

Differential D71929

AMDGPU/GlobalISel: Refine SMRD selection rules
ClosedPublic

Authored by arsenm on Dec 27 2019, 7:52 AM.

Download Raw Diff

Details

Reviewers

nhaehnle
rampitec
tstellar
kerbowa
alex-t

Summary

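Stop selecting SMRD (scalar memory) loads for volatile global loads, and
ensure a load is only selected as SMRD when the memory is constant enough:
constant address space, invariant, or marked with amdgpu.noclobber.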

Diff Detail

Event Timeline

arsenm created this revision. Dec 27 2019, 7:52 AM

Herald added a project: Restricted Project. · View Herald Transcript · Dec 27 2019, 7:52 AM

Herald added subscribers: Petar.Avramovic, jfb, hiraditya and 8 others. · View Herald Transcript

ping

rampitec added inline comments. Jan 2 2020, 10:49 AM

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
414	'(MMO->getSize() % 4) == 0'? What if we need 6 bytes, like v3i16?

Add test

arsenm marked an inline comment as done. Jan 2 2020, 12:41 PM

arsenm added inline comments.

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
414	If it was legal, the alignment is high enough

arsenm added a parent revision: D72099: AMDGPU/GlobalISel: Legalize more odd sized loads. Jan 2 2020, 12:41 PM

rampitec added inline comments. Jan 2 2020, 12:48 PM

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
414	Not necessarily, a vload() may result in underaligned loads. Also we cannot really load a subdword even if aligned, we would need to zero/sign extend it manually.

arsenm marked an inline comment as done. Jan 2 2020, 12:57 PM

arsenm added inline comments.

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
414	The legalizer has already run by this point and would have broken down totally illegal loads. If it is sufficiently aligned, extra bits can be loaded. The high bits do not need to be considered here. The G_ZEXTLOAD/G_SEXTLOAD are not legal, so cases where the high bits matter will not reach here

LGTM

This revision is now accepted and ready to land. Jan 2 2020, 1:02 PM

4e972224c476e05af445130e2b208e9819d220a5

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPURegisterBankInfo.cpp

26 lines

test/

CodeGen/

AMDGPU/

GlobalISel/

regbankselect-load.mir

16 lines

regbankselect.mir

166 lines

Diff 235927

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Show First 20 Lines • Show All 388 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_s_sendmsghalt: {
const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };		const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));		return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
}		}
default:		default:
return RegisterBankInfo::getInstrAlternativeMappings(MI);		return RegisterBankInfo::getInstrAlternativeMappings(MI);
}		}
}		}

		// Returns true if the IR value attached to the memory operand is an
		// Instruction that carries "amdgpu.noclobber" metadata.
		static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
		// dyn_cast_or_null tolerates a null or non-Instruction value; in either
		// case I is nullptr and the function returns false.
		const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
		return I && I->getMetadata("amdgpu.noclobber");
		}

// FIXME: Returns uniform if there's no source value information. This is		// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.		// probably wrong.
static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {		static bool isScalarLoadLegal(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())		if (!MI.hasOneMemOperand())
return false;		return false;

const MachineMemOperand MMO = MI.memoperands_begin();		const MachineMemOperand MMO = MI.memoperands_begin();
		const unsigned AS = MMO->getAddrSpace();
		const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS \|\|
		AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

		// There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&		return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
		rampitec (inline comment, unsubmitted, not done): '(MMO->getSize() % 4) == 0'? What if we need 6 bytes, like v3i16?
		arsenm (author, inline comment, unsubmitted, done): If it was legal, the alignment is high enough
		rampitec (inline comment, unsubmitted, not done): Not necessarily, a vload() may result in underaligned loads. Also we cannot really load a subdword even if aligned, we would need to zero/sign extend it manually.
		arsenm (author, inline comment, unsubmitted, done): The legalizer has already run by this point and would have broken down totally illegal loads. If it is sufficiently aligned, extra bits can be loaded. The high bits do not need to be considered here. The G_ZEXTLOAD/G_SEXTLOAD are not legal, so cases where the high bits matter will not reach here
		// Can't do a scalar atomic load.
		!MMO->isAtomic() &&
		// Don't use scalar loads for volatile accesses to non-constant address
		// spaces.
		(IsConst \|\| !MMO->isVolatile()) &&
		// Memory must be known constant, or not written before this load.
		(IsConst \|\| MMO->isInvariant() \|\| memOpHasNoClobbered(MMO)) &&
AMDGPUInstrInfo::isUniformMMO(MMO);		AMDGPUInstrInfo::isUniformMMO(MMO);
}		}

RegisterBankInfo::InstructionMappings		RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(		AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {		const MachineInstr &MI) const {

const MachineFunction &MF = *MI.getParent()->getParent();		const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();		const MachineRegisterInfo &MRI = MF.getRegInfo();
▲ Show 20 Lines • Show All 91 Lines • ▼ Show 20 Lines	AMDGPURegisterBankInfo::getInstrAlternativeMappings(
case TargetOpcode::G_LOAD:		case TargetOpcode::G_LOAD:
case TargetOpcode::G_ZEXTLOAD:		case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_SEXTLOAD: {		case TargetOpcode::G_SEXTLOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);		unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());		LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
unsigned PtrSize = PtrTy.getSizeInBits();		unsigned PtrSize = PtrTy.getSizeInBits();
unsigned AS = PtrTy.getAddressSpace();		unsigned AS = PtrTy.getAddressSpace();
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());		LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());

if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&		if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
AS != AMDGPUAS::PRIVATE_ADDRESS) &&		AS != AMDGPUAS::PRIVATE_ADDRESS) &&
isInstrUniformNonExtLoadAlign4(MI)) {		isScalarLoadLegal(MI)) {
const InstructionMapping &SSMapping = getInstructionMapping(		const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(		1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),		{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),		AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
2); // Num Operands		2); // Num Operands
AltMappings.push_back(&SSMapping);		AltMappings.push_back(&SSMapping);
}		}

▲ Show 20 Lines • Show All 1,755 Lines • ▼ Show 20 Lines	AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const ValueMapping *ValMapping;		const ValueMapping *ValMapping;
const ValueMapping *PtrMapping;		const ValueMapping *PtrMapping;

const RegisterBank PtrBank = getRegBank(PtrReg, MRI, TRI);		const RegisterBank PtrBank = getRegBank(PtrReg, MRI, TRI);

if (PtrBank == &AMDGPU::SGPRRegBank &&		if (PtrBank == &AMDGPU::SGPRRegBank &&
(AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&		(AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
AS != AMDGPUAS::PRIVATE_ADDRESS) &&		AS != AMDGPUAS::PRIVATE_ADDRESS) &&
isInstrUniformNonExtLoadAlign4(MI)) {		isScalarLoadLegal(MI)) {
// We have a uniform instruction so we want to use an SMRD load		// We have a uniform instruction so we want to use an SMRD load
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);		ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);		PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {		} else {
ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);		ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);		PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}		}

▲ Show 20 Lines • Show All 964 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir

	Show First 20 Lines • Show All 237 Lines • ▼ Show 20 Lines
	---			---
	name: load_global_v8i32_uniform			name: load_global_v8i32_uniform
	legalized: true			legalized: true

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1
	; CHECK-LABEL: name: load_global_v8i32_uniform			; CHECK-LABEL: name: load_global_v8i32_uniform
	; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (load 32, addrspace 1)			; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (invariant load 32, addrspace 1)
	%0:_(p1) = COPY $sgpr0_sgpr1			%0:_(p1) = COPY $sgpr0_sgpr1
	%1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 1)			%1:_(<8 x s32>) = G_LOAD %0 :: (invariant load 32, addrspace 1)
	...			...

	---			---
	name: load_global_v4i64_uniform			name: load_global_v4i64_uniform
	legalized: true			legalized: true

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1
	; CHECK-LABEL: name: load_global_v4i64_uniform			; CHECK-LABEL: name: load_global_v4i64_uniform
	; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (load 32, addrspace 1)			; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (invariant load 32, addrspace 1)
	%0:_(p1) = COPY $sgpr0_sgpr1			%0:_(p1) = COPY $sgpr0_sgpr1
	%1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 1)			%1:_(<4 x s64>) = G_LOAD %0 :: (invariant load 32, addrspace 1)
	...			...

	---			---
	name: load_global_v16i32_uniform			name: load_global_v16i32_uniform
	legalized: true			legalized: true

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1
	; CHECK-LABEL: name: load_global_v16i32_uniform			; CHECK-LABEL: name: load_global_v16i32_uniform
	; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (load 64, addrspace 1)			; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (invariant load 64, addrspace 1)
	%0:_(p1) = COPY $sgpr0_sgpr1			%0:_(p1) = COPY $sgpr0_sgpr1
	%1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 1)			%1:_(<16 x s32>) = G_LOAD %0 :: (invariant load 64, addrspace 1)
	...			...

	---			---
	name: load_global_v8i64_uniform			name: load_global_v8i64_uniform
	legalized: true			legalized: true

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1
	; CHECK-LABEL: name: load_global_v8i64_uniform			; CHECK-LABEL: name: load_global_v8i64_uniform
	; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (load 64, addrspace 1)			; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (invariant load 64, addrspace 1)
	%0:_(p1) = COPY $sgpr0_sgpr1			%0:_(p1) = COPY $sgpr0_sgpr1
	%1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 1)			%1:_(<8 x s64>) = G_LOAD %0 :: (invariant load 64, addrspace 1)
	...			...

	---			---
	name: load_constant_v8i32_non_uniform			name: load_constant_v8i32_non_uniform
	legalized: true			legalized: true

	body: \|			body: \|
	bb.0:			bb.0:
	▲ Show 20 Lines • Show All 405 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir

				# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
	# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=regbankselect %s -verify-machineinstrs -o - \| FileCheck %s			# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=regbankselect %s -verify-machineinstrs -o - \| FileCheck %s

	--- \|			--- \|
	define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) { ret void }			define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) {
	define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {			ret void
				}

				define amdgpu_kernel void @load_constant_volatile(i32 addrspace(4)* %ptr0) {
				ret void
				}

				define amdgpu_kernel void @load_global_uniform_invariant(i32 addrspace(1)* %ptr1) {
				%tmp0 = load i32, i32 addrspace(1)* %ptr1
				ret void
				}

				define amdgpu_kernel void @load_global_uniform_noclobber(i32 addrspace(1)* %ptr1) {
				%tmp0 = load i32, i32 addrspace(1)* %ptr1, !amdgpu.noclobber !0
				ret void
				}

				define amdgpu_kernel void @load_global_uniform_variant(i32 addrspace(1)* %ptr1) {
				%tmp0 = load i32, i32 addrspace(1)* %ptr1
				ret void
				}

				define amdgpu_kernel void @load_global_uniform_volatile_invariant(i32 addrspace(1)* %ptr1) {
	%tmp0 = load i32, i32 addrspace(1)* %ptr1			%tmp0 = load i32, i32 addrspace(1)* %ptr1
	ret void			ret void
	}			}

				define amdgpu_kernel void @load_global_uniform_atomic_invariant(i32 addrspace(1)* %ptr1) {
				%tmp0 = load i32, i32 addrspace(1)* %ptr1
				ret void
				}

	define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {			define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
	%tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0			%tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
	%tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0			%tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
	%tmp2 = load i32, i32 addrspace(1)* %tmp1			%tmp2 = load i32, i32 addrspace(1)* %tmp1
	ret void			ret void
	}			}

	define void @non_power_of_2() { ret void }			define void @non_power_of_2() { ret void }

				define amdgpu_kernel void @load_constant_v4i16_from_6_align8(<3 x i16> addrspace(4)* %ptr0) {
				ret void
				}

	declare i32 @llvm.amdgcn.workitem.id.x() #0			declare i32 @llvm.amdgcn.workitem.id.x() #0
	attributes #0 = { nounwind readnone }			attributes #0 = { nounwind readnone }
	...			!0 = !{}

				...
	---			---
	name : load_constant			name: load_constant
	legalized: true			legalized: true

	# CHECK-LABEL: name: load_constant
	# CHECK: registers:
	# CHECK: - { id: 0, class: sgpr, preferred-register: '' }
	# CHECK: - { id: 1, class: sgpr, preferred-register: '' }

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1
				; CHECK-LABEL: name: load_constant
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
				; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load 4 from %ir.ptr0, addrspace 4)
	%0:_(p4) = COPY $sgpr0_sgpr1			%0:_(p4) = COPY $sgpr0_sgpr1
	%1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0)			%1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0)
	...			...

	---			---
	name: load_global_uniform			name: load_constant_volatile
	legalized: true			legalized: true

	# CHECK-LABEL: name: load_global_uniform			body: \|
	# CHECK: registers:			bb.0:
	# CHECK: - { id: 0, class: sgpr, preferred-register: '' }			liveins: $sgpr0_sgpr1
	# CHECK: - { id: 1, class: sgpr, preferred-register: '' }			; CHECK-LABEL: name: load_constant_volatile
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
				; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (volatile load 4 from %ir.ptr0, addrspace 4)
				%0:_(p4) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (volatile load 4 from %ir.ptr0)
				...

				---
				name: load_global_uniform_invariant
				legalized: true

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1
				; CHECK-LABEL: name: load_global_uniform_invariant
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4 from %ir.ptr1, addrspace 1)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (invariant load 4 from %ir.ptr1)
				...

				---
				name: load_global_uniform_noclobber
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; CHECK-LABEL: name: load_global_uniform_noclobber
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.ptr1, addrspace 1)
	%0:_(p1) = COPY $sgpr0_sgpr1			%0:_(p1) = COPY $sgpr0_sgpr1
	%1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1)			%1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1)
	...			...

	---			---
	name: load_global_non_uniform			name: load_global_uniform_variant
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; CHECK-LABEL: name: load_global_uniform_variant
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.ptr1, addrspace 1)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1)
				...

				---
				name: load_global_uniform_volatile_invariant
				legalized: true

				body: \|
				bb.0:
				liveins: $sgpr0_sgpr1
				; CHECK-LABEL: name: load_global_uniform_volatile_invariant
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (volatile invariant load 4 from %ir.ptr1, addrspace 1)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (volatile invariant load 4 from %ir.ptr1)
				...

				---
				name: load_global_uniform_atomic_invariant
	legalized: true			legalized: true

	# CHECK-LABEL: name: load_global_non_uniform			body: \|
	# CHECK: registers:			bb.0:
	# CHECK: - { id: 0, class: sgpr, preferred-register: '' }			liveins: $sgpr0_sgpr1
	# CHECK: - { id: 1, class: vgpr, preferred-register: '' }			; CHECK-LABEL: name: load_global_uniform_atomic_invariant
	# CHECK: - { id: 2, class: vgpr, preferred-register: '' }			; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load acquire 4 from %ir.ptr1, addrspace 1)
				%0:_(p1) = COPY $sgpr0_sgpr1
				%1:_(s32) = G_LOAD %0 :: (invariant load acquire 4 from %ir.ptr1)
				...

				---
				name: load_global_non_uniform
				legalized: true

	body: \|			body: \|
	bb.0:			bb.0:
	liveins: $sgpr0_sgpr1			liveins: $sgpr0_sgpr1
				; CHECK-LABEL: name: load_global_non_uniform
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
				; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
				; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.tmp1, addrspace 1)
	%0:_(p1) = COPY $sgpr0_sgpr1			%0:_(p1) = COPY $sgpr0_sgpr1
	%1:_(s32) = G_LOAD %0 :: (load 4 from %ir.tmp1)			%1:_(s32) = G_LOAD %0 :: (load 4 from %ir.tmp1)
	...			...

	---			---
	name: non_power_of_2			name: non_power_of_2
	legalized: true			legalized: true

	# CHECK-LABEL: name: non_power_of_2
	# CHECK: [[S448:%[0-9]+]]:sgpr(s448) = G_IMPLICIT_DEF
	# CHECK: sgpr(s32) = G_EXTRACT [[S448]](s448), 0

	body: \|			body: \|
	bb.0:			bb.0:
				; CHECK-LABEL: name: non_power_of_2
				; CHECK: [[DEF:%[0-9]+]]:sgpr(s448) = G_IMPLICIT_DEF
				; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(s32) = G_EXTRACT [[DEF]](s448), 0
				; CHECK: $sgpr0 = COPY [[EXTRACT]](s32)
				; CHECK: SI_RETURN_TO_EPILOG $sgpr0
	%0:_(s448) = G_IMPLICIT_DEF			%0:_(s448) = G_IMPLICIT_DEF
	%1:_(s32) = G_EXTRACT %0:_(s448), 0			%1:_(s32) = G_EXTRACT %0:_(s448), 0
	$sgpr0 = COPY %1:_(s32)			$sgpr0 = COPY %1:_(s32)
	SI_RETURN_TO_EPILOG $sgpr0			SI_RETURN_TO_EPILOG $sgpr0
	...			...

				---
				name: load_constant_v4i16_from_6_align8
				legalized: true

				body: \|
				bb.0:
				; CHECK-LABEL: name: load_constant_v4i16_from_6_align8
				; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
				; CHECK: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load 6 from %ir.ptr0, align 8, addrspace 4)
				%0:_(p4) = COPY $sgpr0_sgpr1
				%1:_(<4 x s16>) = G_LOAD %0 :: (load 6 from %ir.ptr0, align 8, addrspace 4)

				...