diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -444,8 +444,8 @@
   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
 
-  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
-  return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
+  // Require 4-byte alignment.
+  return MMO->getSize() >= 1 && MMO->getAlign() >= Align(4) &&
          // Can't do a scalar atomic load.
          !MMO->isAtomic() &&
          // Don't use scalar loads for volatile accesses to non-constant address
@@ -1148,31 +1148,47 @@
   const RegisterBank *PtrBank =
     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   if (PtrBank == &AMDGPU::SGPRRegBank) {
-    // If the pointer is an SGPR, we ordinarily have nothing to do.
-    if (LoadSize != 96)
+    MachineMemOperand *MMO = *MI.memoperands_begin();
+    // There are some special cases that we need to look at for 32-bit and
+    // 96-bit SGPR loads; otherwise we have nothing to do.
+    if (LoadSize != 32 && LoadSize != 96)
+      return false;
+
+    // Scalar loads of size 8 or 16 bits with proper alignment may be widened
+    // to 32 bits.
+    if (LoadSize == 32 && (MMO->getSize() >= 4 ||
+                           MI.getOpcode() != AMDGPU::G_LOAD ||
+                           LoadTy.isVector() ||
+                           !isScalarLoadLegal(MI)))
       return false;
 
-    MachineMemOperand *MMO = *MI.memoperands_begin();
     Register PtrReg = MI.getOperand(1).getReg();
-    // 96-bit loads are only available for vector loads. We need to split this
-    // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
 
     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
     MachineIRBuilder B(MI, O);
 
-    if (MMO->getAlign() < Align(16)) {
-      LLT Part64, Part32;
-      std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
-      auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
-      auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
-
-      auto Undef = B.buildUndef(LoadTy);
-      auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
-      B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
-    } else {
-      LLT WiderTy = widen96To128(LoadTy);
+    if (LoadSize == 32) {
+      // Widen memory access for 8 or 16 bit scalar loads to 32 bit.
+      LLT WiderTy = LLT::scalar(32);
       auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
       B.buildExtract(MI.getOperand(0), WideLoad, 0);
+    } else {
+      // 96-bit loads are only available for vector loads. We need to split this
+      // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
+      if (MMO->getAlign() < Align(16)) {
+        LLT Part64, Part32;
+        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
+        auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
+        auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
+
+        auto Undef = B.buildUndef(LoadTy);
+        auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
+        B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+      } else {
+        LLT WiderTy = widen96To128(LoadTy);
+        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
+        B.buildExtract(MI.getOperand(0), WideLoad, 0);
+      }
     }
 
     MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -6,18 +6,13 @@
 ; CI-LABEL: frem_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_load_dword s0, s[6:7], 0x0
+; CI-NEXT:    s_load_dword s1, s[8:9], 0x2
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; CI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
 ; CI-NEXT:    v_div_scale_f32 v2, s[0:1], v1, v1, v0
 ; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; CI-NEXT:    v_rcp_f32_e32 v4, v2
@@ -30,7 +25,8 @@
 ; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
 ; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
@@ -44,24 +40,18 @@
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s8, 8
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    s_addc_u32 s1, s9, 0
-; VI-NEXT:    flat_load_ushort v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_load_dword s0, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s1, s[8:9], 0x8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_f16_e32 v1, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; VI-NEXT:    v_rcp_f32_e32 v3, v3
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v3
-; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT:    v_div_fixup_f16 v1, v1, v0, v2
-; VI-NEXT:    v_trunc_f16_e32 v1, v1
-; VI-NEXT:    v_fma_f16 v2, -v1, v0, v2
+; VI-NEXT:    v_rcp_f32_e32 v2, v2
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s0
+; VI-NEXT:    v_trunc_f16_e32 v0, v0
+; VI-NEXT:    v_fma_f16 v2, -v0, v1, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
@@ -78,19 +68,15 @@
 ; CI-LABEL: fast_frem_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
-; CI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[10:11]
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    s_load_dword s0, s[6:7], 0x0
+; CI-NEXT:    s_load_dword s1, s[8:9], 0x2
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
 ; CI-NEXT:    v_rcp_f32_e32 v2, v1
 ; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
@@ -104,19 +90,14 @@
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s8, 8
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    s_addc_u32 s1, s9, 0
-; VI-NEXT:    flat_load_ushort v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rcp_f16_e32 v1, v0
-; VI-NEXT:    v_mul_f16_e32 v1, v2, v1
-; VI-NEXT:    v_trunc_f16_e32 v1, v1
-; VI-NEXT:    v_fma_f16 v2, -v1, v0, v2
+; VI-NEXT:    s_load_dword s0, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s1, s[8:9], 0x8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_rcp_f16_e32 v0, s1
+; VI-NEXT:    v_mul_f16_e32 v0, s0, v0
+; VI-NEXT:    v_trunc_f16_e32 v0, v0
+; VI-NEXT:    v_fma_f16 v2, -v0, s1, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
@@ -133,19 +114,15 @@
 ; CI-LABEL: unsafe_frem_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_load_dword s0, s[6:7], 0x0
+; CI-NEXT:    s_load_dword s1, s[8:9], 0x2
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
-; CI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[10:11]
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
 ; CI-NEXT:    v_rcp_f32_e32 v2, v1
 ; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
@@ -159,19 +136,14 @@
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_add_u32 s0, s8, 8
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    s_addc_u32 s1, s9, 0
-; VI-NEXT:    flat_load_ushort v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rcp_f16_e32 v1, v0
-; VI-NEXT:    v_mul_f16_e32 v1, v2, v1
-; VI-NEXT:    v_trunc_f16_e32 v1, v1
-; VI-NEXT:    v_fma_f16 v2, -v1, v0, v2
+; VI-NEXT:    s_load_dword s0, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s1, s[8:9], 0x8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_rcp_f16_e32 v0, s1
+; VI-NEXT:    v_mul_f16_e32 v0, s0, v0
+; VI-NEXT:    v_trunc_f16_e32 v0, v0
+; VI-NEXT:    v_fma_f16 v2, -v0, s1, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -0,0 +1,20 @@
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+; CHECK-LABEL: {{^}}constant_load_i8:
+; CHECK: s_load_dword s2, s[2:3], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+  %ld = load i8, i8 addrspace(4)* %in, align 4
+  store i8 %ld, i8 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+; CHECK-LABEL: {{^}}constant_load_i16:
+; CHECK: s_load_dword s2, s[2:3], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+  %ld = load i16, i16 addrspace(4)* %in, align 4
+  store i16 %ld, i16 addrspace(1)* %out, align 4
+  ret void
+}
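The new RegisterBankInfo path only fires when isScalarLoadLegal still holds, so the 4-byte alignment requirement kept in the first hunk remains the gate for widening. A minimal negative-case sketch (illustrative only, not part of the patch; hypothetical function name, same RUN line as widen-i8-i16-scalar-loads.ll) is:

; Illustrative extra case, assuming the RUN line above: align 1 fails the
; MMO->getAlign() >= Align(4) check in isScalarLoadLegal, so this load is
; expected to stay on the VGPR path (a byte load) rather than become an
; s_load_dword.
define amdgpu_kernel void @constant_load_i8_align1(i8 addrspace(1)* %out, i8 addrspace(4)* %in) {
  %ld = load i8, i8 addrspace(4)* %in, align 1
  store i8 %ld, i8 addrspace(1)* %out, align 1
  ret void
}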