Diff 337299

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

//===- AMDGPURegisterBankInfo.cpp -------------------------------- C++ --==//		//===- AMDGPURegisterBankInfo.cpp -------------------------------- C++ --==//
		Lint: clang-format not found in user's PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
/// \file		/// \file
▲ Show 20 Lines • Show All 430 Lines • ▼ Show 20 Lines	static bool isScalarLoadLegal(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())		if (!MI.hasOneMemOperand())
return false;		return false;

const MachineMemOperand MMO = MI.memoperands_begin();		const MachineMemOperand MMO = MI.memoperands_begin();
const unsigned AS = MMO->getAddrSpace();		const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS \|\|		const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS \|\|
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;		AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

// There are no extending SMRD/SMEM loads, and they require 4-byte alignment.		// Require 4-byte alignment.
return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&		return MMO->getSize() >= 1 && MMO->getAlign() >= Align(4) &&
		arsenm (Unsubmitted, Not Done): There's no point in checking the size anymore
		arsenm (Unsubmitted, Not Done): The old comment was more informative
// Can't do a scalar atomic load.		// Can't do a scalar atomic load.
!MMO->isAtomic() &&		!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address		// Don't use scalar loads for volatile accesses to non-constant address
// spaces.		// spaces.
(IsConst \|\| !MMO->isVolatile()) &&		(IsConst \|\| !MMO->isVolatile()) &&
// Memory must be known constant, or not written before this load.		// Memory must be known constant, or not written before this load.
(IsConst \|\| MMO->isInvariant() \|\| memOpHasNoClobbered(MMO)) &&		(IsConst \|\| MMO->isInvariant() \|\| memOpHasNoClobbered(MMO)) &&
AMDGPUInstrInfo::isUniformMMO(MMO);		AMDGPUInstrInfo::isUniformMMO(MMO);
▲ Show 20 Lines • Show All 686 Lines • ▼ Show 20 Lines	bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();		Register DstReg = MI.getOperand(0).getReg();
const LLT LoadTy = MRI.getType(DstReg);		const LLT LoadTy = MRI.getType(DstReg);
unsigned LoadSize = LoadTy.getSizeInBits();		unsigned LoadSize = LoadTy.getSizeInBits();
const unsigned MaxNonSmrdLoadSize = 128;		const unsigned MaxNonSmrdLoadSize = 128;

const RegisterBank *PtrBank =		const RegisterBank *PtrBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;		OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
if (PtrBank == &AMDGPU::SGPRRegBank) {		if (PtrBank == &AMDGPU::SGPRRegBank) {
// If the pointer is an SGPR, we ordinarily have nothing to do.		MachineMemOperand MMO = MI.memoperands_begin();
if (LoadSize != 96)		// There are some special cases that we need to look at for 32 bit and 96
		// bit SGPR loads otherwise we have nothing to do.
		if (LoadSize != 32 && LoadSize != 96)
		return false;

		// Scalar loads of size 8 or 16 bit with proper alignment may be widen to 32
		// bit.
		if (LoadSize == 32 && (MMO->getSize() >= 4 \|\|
		Lint: Pre-merge checks: clang-format: please reformat the code
		```
		- if (LoadSize == 32 && (MMO->getSize() >= 4 ||
		-     MI.getOpcode() != AMDGPU::G_LOAD ||
		-     LoadTy.isVector() ||
		-     !isScalarLoadLegal(MI)))
		+ if (LoadSize == 32 &&
		+     (MMO->getSize() >= 4 || MI.getOpcode() != AMDGPU::G_LOAD ||
		+      LoadTy.isVector() || !isScalarLoadLegal(MI)))
		```
		arsenm (Unsubmitted, Not Done): I don't understand this getSize check. It's illegal for the memory size to exceed the loaded type
		MI.getOpcode() != AMDGPU::G_LOAD \|\|
		arsenm (Unsubmitted, Not Done): You could also handle G_SEXTLOAD and G_ZEXTLOAD, but would require inserting some instructions to set the high bits appropriately. This also makes me think this should be done earlier
		LoadTy.isVector() \|\|
		!isScalarLoadLegal(MI)))
return false;		return false;

MachineMemOperand MMO = MI.memoperands_begin();
Register PtrReg = MI.getOperand(1).getReg();		Register PtrReg = MI.getOperand(1).getReg();
// 96-bit loads are only available for vector loads. We need to split this
// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).

		arsenm (Unsubmitted, Not Done): Grammar, widen->widened
ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);		ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
MachineIRBuilder B(MI, O);		MachineIRBuilder B(MI, O);

		if (LoadSize == 32) {
		// Widen memory access for 8 or 16 bit scalar loads to 32 bit.
		arsenm (Unsubmitted, Not Done): Needs to elaborate more on when this is valid
		arsenm (Unsubmitted, Not Done): This comment is worded confusingly so it doesn't explain what is going on. We are widening the memory access, not the result register type
		LLT WiderTy = LLT::scalar(32);
		auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
		B.buildExtract(MI.getOperand(0), WideLoad, 0);
		arsenm (Unsubmitted, Not Done): Should use trunc. I'm trying to eliminate G_EXTRACT usage
		arsenm (Unsubmitted, Not Done): Should just hoist the S32 definition instead of repeating it
		} else {
		// 96-bit loads are only available for vector loads. We need to split this
		// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
if (MMO->getAlign() < Align(16)) {		if (MMO->getAlign() < Align(16)) {
LLT Part64, Part32;		LLT Part64, Part32;
std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);		std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);		auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
		foad (Unsubmitted, Not Done): Use B.buildZExtInReg.
		arsenm (Unsubmitted, Not Done): S32.getScalarSizeInBits() is a complicated way of just saying 32
		arsenm (Unsubmitted, Not Done): MachineIRBuilder has a buildZExtInReg helper to figure out the mask and create the and for you
		vangthao (Author, Unsubmitted, Done): I attempted to use B.buildZExtInReg but this function creates a new virtual register as the destination instead of using MI.getOperand(0)'s register when passing it as the first argument. This is unlike buildSExtInReg which uses the first argument as the destination.
		foad (Unsubmitted, Not Done): Ah, that's a bug in the implementation:
		```
		--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
		+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
		@@ -492,7 +492,7 @@ MachineInstrBuilder MachineIRBuilder::buildZExtInReg(const DstOp &Res,
		   LLT ResTy = Res.getLLTTy(getMRI());
		   auto Mask = buildConstant(
		       ResTy, APInt::getLowBitsSet(ResTy.getScalarSizeInBits(), ImmOp));
		-  return buildAnd(ResTy, Op, Mask);
		+  return buildAnd(Res, Op, Mask);
		 }
		 
		 MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst,
		```
		vangthao (Author, Unsubmitted, Done): I used this previously but there was an issue with the implementation in buildZExtInReg where it would create a new destination register instead of using the original destination register passed to it. Jay mentioned this bug in a previous comment.
		arsenm (Unsubmitted, Not Done): Yes, buildZExtInReg is broken and needs to be fixed. You should fix it and continue using it
auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);		auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

auto Undef = B.buildUndef(LoadTy);		auto Undef = B.buildUndef(LoadTy);
auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);		auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);		B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
} else {		} else {
LLT WiderTy = widen96To128(LoadTy);		LLT WiderTy = widen96To128(LoadTy);
auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);		auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
B.buildExtract(MI.getOperand(0), WideLoad, 0);		B.buildExtract(MI.getOperand(0), WideLoad, 0);
}		}
		}

MI.eraseFromParent();		MI.eraseFromParent();
return true;		return true;
}		}

// 128-bit loads are supported for all instruction types.		// 128-bit loads are supported for all instruction types.
if (LoadSize <= MaxNonSmrdLoadSize)		if (LoadSize <= MaxNonSmrdLoadSize)
return false;		return false;
▲ Show 20 Lines • Show All 3,191 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck --check-prefix=CI %s			; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck --check-prefix=CI %s
	; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck --check-prefix=VI %s			; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck --check-prefix=VI %s

	define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {			define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {
	; CI-LABEL: frem_f16:			; CI-LABEL: frem_f16:
	; CI: ; %bb.0:			; CI: ; %bb.0:
	; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd			; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
	; CI-NEXT: s_mov_b32 s10, -1			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b32 s11, 0xf000			; CI-NEXT: s_load_dword s0, s[6:7], 0x0
	; CI-NEXT: s_mov_b64 s[2:3], s[10:11]			; CI-NEXT: s_load_dword s1, s[8:9], 0x2
	; CI-NEXT: s_waitcnt lgkmcnt(0)			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b64 s[8:9], s[6:7]			; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0			; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
	; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
	; CI-NEXT: s_waitcnt vmcnt(1)
	; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
	; CI-NEXT: s_waitcnt vmcnt(0)
	; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
	; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0			; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
	; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0			; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
	; CI-NEXT: v_rcp_f32_e32 v4, v2			; CI-NEXT: v_rcp_f32_e32 v4, v2
	; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3			; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
	; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0			; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
	; CI-NEXT: v_fma_f32 v4, v5, v4, v4			; CI-NEXT: v_fma_f32 v4, v5, v4, v4
	; CI-NEXT: v_mul_f32_e32 v5, v3, v4			; CI-NEXT: v_mul_f32_e32 v5, v3, v4
	; CI-NEXT: v_fma_f32 v6, -v2, v5, v3			; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
	; CI-NEXT: v_fma_f32 v5, v6, v4, v5			; CI-NEXT: v_fma_f32 v5, v6, v4, v5
	; CI-NEXT: v_fma_f32 v2, -v2, v5, v3			; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
	; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0			; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
	; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5			; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
	; CI-NEXT: s_mov_b64 s[6:7], s[10:11]			; CI-NEXT: s_mov_b32 s6, -1
				; CI-NEXT: s_mov_b32 s7, 0xf000
	; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0			; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
	; CI-NEXT: v_trunc_f32_e32 v2, v2			; CI-NEXT: v_trunc_f32_e32 v2, v2
	; CI-NEXT: v_fma_f32 v0, -v2, v1, v0			; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
	; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0			; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
	; CI-NEXT: v_cvt_f16_f32_e32 v0, v0			; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; CI-NEXT: buffer_store_short v0, off, s[4:7], 0			; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
	; CI-NEXT: s_endpgm			; CI-NEXT: s_endpgm
	;			;
	; VI-LABEL: frem_f16:			; VI-LABEL: frem_f16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v0, s6			; VI-NEXT: s_load_dword s0, s[6:7], 0x0
	; VI-NEXT: s_add_u32 s0, s8, 8			; VI-NEXT: s_load_dword s1, s[8:9], 0x8
	; VI-NEXT: v_mov_b32_e32 v1, s7			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: s_addc_u32 s1, s9, 0			; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; VI-NEXT: flat_load_ushort v2, v[0:1]			; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
	; VI-NEXT: v_mov_b32_e32 v0, s0
	; VI-NEXT: v_mov_b32_e32 v1, s1			; VI-NEXT: v_mov_b32_e32 v1, s1
	; VI-NEXT: flat_load_ushort v0, v[0:1]			; VI-NEXT: v_rcp_f32_e32 v2, v2
	; VI-NEXT: s_waitcnt vmcnt(1)			; VI-NEXT: v_mul_f32_e32 v0, v0, v2
	; VI-NEXT: v_cvt_f32_f16_e32 v1, v2			; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; VI-NEXT: s_waitcnt vmcnt(0)			; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
	; VI-NEXT: v_cvt_f32_f16_e32 v3, v0			; VI-NEXT: v_trunc_f16_e32 v0, v0
	; VI-NEXT: v_rcp_f32_e32 v3, v3			; VI-NEXT: v_fma_f16 v2, -v0, v1, s0
	; VI-NEXT: v_mul_f32_e32 v1, v1, v3
	; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
	; VI-NEXT: v_div_fixup_f16 v1, v1, v0, v2
	; VI-NEXT: v_trunc_f16_e32 v1, v1
	; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: flat_store_short v[0:1], v2			; VI-NEXT: flat_store_short v[0:1], v2
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4			%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
	%r0 = load half, half addrspace(1)* %in1, align 4			%r0 = load half, half addrspace(1)* %in1, align 4
	%r1 = load half, half addrspace(1)* %gep2, align 4			%r1 = load half, half addrspace(1)* %gep2, align 4
	%r2 = frem half %r0, %r1			%r2 = frem half %r0, %r1
	store half %r2, half addrspace(1)* %out, align 4			store half %r2, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {			define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {
	; CI-LABEL: fast_frem_f16:			; CI-LABEL: fast_frem_f16:
	; CI: ; %bb.0:			; CI: ; %bb.0:
	; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd			; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
	; CI-NEXT: s_mov_b32 s10, -1			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b32 s11, 0xf000			; CI-NEXT: s_load_dword s0, s[6:7], 0x0
	; CI-NEXT: s_mov_b64 s[2:3], s[10:11]			; CI-NEXT: s_load_dword s1, s[8:9], 0x2
	; CI-NEXT: s_waitcnt lgkmcnt(0)			; CI-NEXT: s_mov_b32 s6, -1
	; CI-NEXT: s_mov_b64 s[8:9], s[6:7]			; CI-NEXT: s_mov_b32 s7, 0xf000
	; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0			; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; CI-NEXT: s_mov_b64 s[6:7], s[10:11]			; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
	; CI-NEXT: s_waitcnt vmcnt(1)
	; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
	; CI-NEXT: s_waitcnt vmcnt(0)
	; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
	; CI-NEXT: v_rcp_f32_e32 v2, v1			; CI-NEXT: v_rcp_f32_e32 v2, v1
	; CI-NEXT: v_mul_f32_e32 v2, v0, v2			; CI-NEXT: v_mul_f32_e32 v2, v0, v2
	; CI-NEXT: v_trunc_f32_e32 v2, v2			; CI-NEXT: v_trunc_f32_e32 v2, v2
	; CI-NEXT: v_fma_f32 v0, -v2, v1, v0			; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
	; CI-NEXT: v_cvt_f16_f32_e32 v0, v0			; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; CI-NEXT: buffer_store_short v0, off, s[4:7], 0			; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
	; CI-NEXT: s_endpgm			; CI-NEXT: s_endpgm
	;			;
	; VI-LABEL: fast_frem_f16:			; VI-LABEL: fast_frem_f16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v0, s6			; VI-NEXT: s_load_dword s0, s[6:7], 0x0
	; VI-NEXT: s_add_u32 s0, s8, 8			; VI-NEXT: s_load_dword s1, s[8:9], 0x8
	; VI-NEXT: v_mov_b32_e32 v1, s7			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: s_addc_u32 s1, s9, 0			; VI-NEXT: v_mov_b32_e32 v1, s0
	; VI-NEXT: flat_load_ushort v2, v[0:1]			; VI-NEXT: v_rcp_f16_e32 v0, s1
	; VI-NEXT: v_mov_b32_e32 v0, s0			; VI-NEXT: v_mul_f16_e32 v0, s0, v0
	; VI-NEXT: v_mov_b32_e32 v1, s1			; VI-NEXT: v_trunc_f16_e32 v0, v0
	; VI-NEXT: flat_load_ushort v0, v[0:1]			; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
	; VI-NEXT: s_waitcnt vmcnt(0)
	; VI-NEXT: v_rcp_f16_e32 v1, v0
	; VI-NEXT: v_mul_f16_e32 v1, v2, v1
	; VI-NEXT: v_trunc_f16_e32 v1, v1
	; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: flat_store_short v[0:1], v2			; VI-NEXT: flat_store_short v[0:1], v2
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4			%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
	%r0 = load half, half addrspace(1)* %in1, align 4			%r0 = load half, half addrspace(1)* %in1, align 4
	%r1 = load half, half addrspace(1)* %gep2, align 4			%r1 = load half, half addrspace(1)* %gep2, align 4
	%r2 = frem fast half %r0, %r1			%r2 = frem fast half %r0, %r1
	store half %r2, half addrspace(1)* %out, align 4			store half %r2, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #1 {			define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #1 {
	; CI-LABEL: unsafe_frem_f16:			; CI-LABEL: unsafe_frem_f16:
	; CI: ; %bb.0:			; CI: ; %bb.0:
	; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd			; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
	; CI-NEXT: s_mov_b32 s10, -1			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: s_mov_b32 s11, 0xf000			; CI-NEXT: s_load_dword s0, s[6:7], 0x0
	; CI-NEXT: s_mov_b64 s[2:3], s[10:11]			; CI-NEXT: s_load_dword s1, s[8:9], 0x2
	; CI-NEXT: s_waitcnt lgkmcnt(0)			; CI-NEXT: s_mov_b32 s6, -1
	; CI-NEXT: s_mov_b64 s[8:9], s[6:7]			; CI-NEXT: s_mov_b32 s7, 0xf000
	; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8			; CI-NEXT: s_waitcnt lgkmcnt(0)
	; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0			; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
	; CI-NEXT: s_mov_b64 s[6:7], s[10:11]			; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
	; CI-NEXT: s_waitcnt vmcnt(1)
	; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
	; CI-NEXT: s_waitcnt vmcnt(0)
	; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
	; CI-NEXT: v_rcp_f32_e32 v2, v1			; CI-NEXT: v_rcp_f32_e32 v2, v1
	; CI-NEXT: v_mul_f32_e32 v2, v0, v2			; CI-NEXT: v_mul_f32_e32 v2, v0, v2
	; CI-NEXT: v_trunc_f32_e32 v2, v2			; CI-NEXT: v_trunc_f32_e32 v2, v2
	; CI-NEXT: v_fma_f32 v0, -v2, v1, v0			; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
	; CI-NEXT: v_cvt_f16_f32_e32 v0, v0			; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
	; CI-NEXT: buffer_store_short v0, off, s[4:7], 0			; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
	; CI-NEXT: s_endpgm			; CI-NEXT: s_endpgm
	;			;
	; VI-LABEL: unsafe_frem_f16:			; VI-LABEL: unsafe_frem_f16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v0, s6			; VI-NEXT: s_load_dword s0, s[6:7], 0x0
	; VI-NEXT: s_add_u32 s0, s8, 8			; VI-NEXT: s_load_dword s1, s[8:9], 0x8
	; VI-NEXT: v_mov_b32_e32 v1, s7			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: s_addc_u32 s1, s9, 0			; VI-NEXT: v_mov_b32_e32 v1, s0
	; VI-NEXT: flat_load_ushort v2, v[0:1]			; VI-NEXT: v_rcp_f16_e32 v0, s1
	; VI-NEXT: v_mov_b32_e32 v0, s0			; VI-NEXT: v_mul_f16_e32 v0, s0, v0
	; VI-NEXT: v_mov_b32_e32 v1, s1			; VI-NEXT: v_trunc_f16_e32 v0, v0
	; VI-NEXT: flat_load_ushort v0, v[0:1]			; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
	; VI-NEXT: s_waitcnt vmcnt(0)
	; VI-NEXT: v_rcp_f16_e32 v1, v0
	; VI-NEXT: v_mul_f16_e32 v1, v2, v1
	; VI-NEXT: v_trunc_f16_e32 v1, v1
	; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: flat_store_short v[0:1], v2			; VI-NEXT: flat_store_short v[0:1], v2
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4			%gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
	%r0 = load half, half addrspace(1)* %in1, align 4			%r0 = load half, half addrspace(1)* %in1, align 4
	%r1 = load half, half addrspace(1)* %gep2, align 4			%r1 = load half, half addrspace(1)* %gep2, align 4
	%r2 = frem half %r0, %r1			%r2 = frem half %r0, %r1
	▲ Show 20 Lines • Show All 956 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

This file was added.

				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck %s

				define amdgpu_kernel void @constant_load_i8(i8 addrspace (1)* %out, i8 addrspace(4)* %in) #0 {
				; CHECK-LABEL: {{^}}constant_load_i8:
				; CHECK: s_load_dword s2, s[2:3], 0x0
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				%ld = load i8, i8 addrspace(4)* %in, align 4
				store i8 %ld, i8 addrspace(1)* %out, align 4
				ret void
				}

				define amdgpu_kernel void @constant_load_i16(i16 addrspace (1)* %out, i16 addrspace(4)* %in) #0 {
				; CHECK-LABEL: {{^}}constant_load_i16:
				; CHECK: s_load_dword s2, s[2:3], 0x0
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				%ld = load i16, i16 addrspace(4)* %in, align 4
				store i16 %ld, i16 addrspace(1)* %out, align 4
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 337299

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 337299

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

[AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
ClosedPublic