diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -492,7 +492,7 @@
   LLT ResTy = Res.getLLTTy(*getMRI());
   auto Mask = buildConstant(
       ResTy, APInt::getLowBitsSet(ResTy.getScalarSizeInBits(), ImmOp));
-  return buildAnd(ResTy, Op, Mask);
+  return buildAnd(Res, Op, Mask);
 }
 
 MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -443,9 +443,8 @@
   const unsigned AS = MMO->getAddrSpace();
   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
-
-  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
-  return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
+  // Require 4-byte alignment.
+  return MMO->getAlign() >= Align(4) &&
          // Can't do a scalar atomic load.
          !MMO->isAtomic() &&
          // Don't use scalar loads for volatile accesses to non-constant address
@@ -1148,31 +1147,59 @@
   const RegisterBank *PtrBank =
     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   if (PtrBank == &AMDGPU::SGPRRegBank) {
-    // If the pointer is an SGPR, we ordinarily have nothing to do.
-    if (LoadSize != 96)
+    // There are some special cases that we need to look at for 32-bit and
+    // 96-bit SGPR loads; otherwise we have nothing to do.
+    if (LoadSize != 32 && LoadSize != 96)
       return false;
 
     MachineMemOperand *MMO = *MI.memoperands_begin();
+    const unsigned MemSize = 8 * MMO->getSize();
+    // Scalar loads of size 8 or 16 bits with proper alignment may be
+    // widened to 32 bits. Check whether we need to widen the memory access:
+    // widened 8- and 16-bit scalar loads have a load size of 32 but a memory
+    // access size of less than 32.
+    if (LoadSize == 32 &&
+        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
+      return false;
+
     Register PtrReg = MI.getOperand(1).getReg();
-    // 96-bit loads are only available for vector loads. We need to split this
-    // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
 
     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
     MachineIRBuilder B(MI, O);
 
-    if (MMO->getAlign() < Align(16)) {
-      LLT Part64, Part32;
-      std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
-      auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
-      auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
-
-      auto Undef = B.buildUndef(LoadTy);
-      auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
-      B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+    if (LoadSize == 32) {
+      // The scalar load was legalized to a 32-bit result, but the memory
+      // access has not been widened yet, so widen it to 32 bits here.
+ if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { + // Must extend the sign bit into higher bits for a G_SEXTLOAD + const LLT S32 = LLT::scalar(32); + auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); + B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize); + } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) { + // Must extend zero into higher bits with an AND for a G_ZEXTLOAD + const LLT S32 = LLT::scalar(32); + auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); + B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize); + } else + // We do not need to touch the higher bits for regular loads. + B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0); } else { - LLT WiderTy = widen96To128(LoadTy); - auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - B.buildExtract(MI.getOperand(0), WideLoad, 0); + // 96-bit loads are only available for vector loads. We need to split this + // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). + if (MMO->getAlign() < Align(16)) { + LLT Part64, Part32; + std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); + auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); + auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); + + auto Undef = B.buildUndef(LoadTy); + auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); + B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + } else { + LLT WiderTy = widen96To128(LoadTy); + auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); + B.buildExtract(MI.getOperand(0), WideLoad, 0); + } } MI.eraseFromParent(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -6,18 +6,13 @@ ; CI-LABEL: frem_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b64 s[2:3], s[10:11] +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s1, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] -; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 @@ -30,7 +25,8 @@ ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -44,24 +40,18 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s8, 8 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s1, s9, 0 -; VI-NEXT: flat_load_ushort v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s1, s[8:9], 0x8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: 
v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; VI-NEXT: v_rcp_f32_e32 v3, v3 -; VI-NEXT: v_mul_f32_e32 v1, v1, v3 -; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v1, v1, v0, v2 -; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v2, -v1, v0, v2 +; VI-NEXT: v_rcp_f32_e32 v2, v2 +; VI-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; VI-NEXT: v_trunc_f16_e32 v0, v0 +; VI-NEXT: v_fma_f16 v2, -v0, v1, s0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -78,19 +68,15 @@ ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b64 s[2:3], s[10:11] +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] -; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 -; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s1, s[8:9], 0x2 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_rcp_f32_e32 v2, v1 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 @@ -104,19 +90,14 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s8, 8 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s1, s9, 0 -; VI-NEXT: flat_load_ushort v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f16_e32 v1, v0 -; VI-NEXT: v_mul_f16_e32 v1, v2, v1 -; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v2, -v1, v0, v2 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s1, s[8:9], 0x8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_rcp_f16_e32 v0, s1 +; VI-NEXT: v_mul_f16_e32 v0, s0, v0 +; VI-NEXT: v_trunc_f16_e32 v0, v0 +; VI-NEXT: v_fma_f16 v2, -v0, s1, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -133,19 +114,15 @@ ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b64 s[2:3], s[10:11] +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s1, s[8:9], 0x2 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] -; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 -; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] -; CI-NEXT: s_waitcnt 
vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_rcp_f32_e32 v2, v1 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 @@ -159,19 +136,14 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s8, 8 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s1, s9, 0 -; VI-NEXT: flat_load_ushort v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f16_e32 v1, v0 -; VI-NEXT: v_mul_f16_e32 v1, v2, v1 -; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v2, -v1, v0, v2 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s1, s[8:9], 0x8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_rcp_f16_e32 v0, s1 +; VI-NEXT: v_mul_f16_e32 v0, s0, v0 +; VI-NEXT: v_trunc_f16_e32 v0, v0 +; VI-NEXT: v_fma_f16 v2, -v0, s1, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_short v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir @@ -0,0 +1,289 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -global-isel -march=amdgcn -mcpu=fiji -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +--- +name: i8_to_i32_spgr_align8 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: i8_to_i32_spgr_align8 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4) + ; GFX9-LABEL: name: i8_to_i32_spgr_align8 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4) + ; GFX10-LABEL: name: i8_to_i32_spgr_align8 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (invariant load 1, align 8, addrspace 4 ) +... 
+--- +name: i8_to_i32_spgr_align4 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: i8_to_i32_spgr_align4 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX9-LABEL: name: i8_to_i32_spgr_align4 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX10-LABEL: name: i8_to_i32_spgr_align4 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (invariant load 1, align 4, addrspace 4 ) +... +--- +name: i16_to_i32_spgr_align4 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: i16_to_i32_spgr_align4 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX9-LABEL: name: i16_to_i32_spgr_align4 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX10-LABEL: name: i16_to_i32_spgr_align4 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (invariant load 2, align 4, addrspace 4 ) +... +--- +name: sextload_i8_to_i32_spgr_align4 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: sextload_i8_to_i32_spgr_align4 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX8: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8 + ; GFX9-LABEL: name: sextload_i8_to_i32_spgr_align4 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX9: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8 + ; GFX10-LABEL: name: sextload_i8_to_i32_spgr_align4 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX10: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8 + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 1, align 4, addrspace 4 ) +... 
+--- +name: sextload_i16_to_i32_spgr_align4 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: sextload_i16_to_i32_spgr_align4 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX8: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16 + ; GFX9-LABEL: name: sextload_i16_to_i32_spgr_align4 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX9: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16 + ; GFX10-LABEL: name: sextload_i16_to_i32_spgr_align4 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX10: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16 + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 2, align 4, addrspace 4 ) +... + +--- +name: zextload_i8_to_i32_spgr_align4 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: zextload_i8_to_i32_spgr_align4 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX8: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 + ; GFX8: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] + ; GFX9-LABEL: name: zextload_i8_to_i32_spgr_align4 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 + ; GFX9: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] + ; GFX10-LABEL: name: zextload_i8_to_i32_spgr_align4 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 + ; GFX10: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 1, align 4, addrspace 4 ) +... +--- +name: zextload_i16_to_i32_spgr_align4 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: zextload_i16_to_i32_spgr_align4 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX8: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; GFX8: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] + ; GFX9-LABEL: name: zextload_i16_to_i32_spgr_align4 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; GFX9: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] + ; GFX10-LABEL: name: zextload_i16_to_i32_spgr_align4 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4) + ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; GFX10: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 2, align 4, addrspace 4 ) +... 
+--- +name: i8_to_i32_vgpr_align2 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: i8_to_i32_vgpr_align2 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + ; GFX9-LABEL: name: i8_to_i32_vgpr_align2 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + ; GFX10-LABEL: name: i8_to_i32_vgpr_align2 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (invariant load 1, align 2, addrspace 4 ) +... + +--- +name: i16_to_i32_vgpr_align2 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: i16_to_i32_vgpr_align2 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + ; GFX9-LABEL: name: i16_to_i32_vgpr_align2 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + ; GFX10-LABEL: name: i16_to_i32_vgpr_align2 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (invariant load 2, align 2, addrspace 4 ) +... +--- +name: sext_i8_to_i32_vgpr_align2 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: sext_i8_to_i32_vgpr_align2 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX8: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + ; GFX9-LABEL: name: sext_i8_to_i32_vgpr_align2 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX9: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + ; GFX10-LABEL: name: sext_i8_to_i32_vgpr_align2 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX10: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 1, align 2, addrspace 4 ) +... 
+--- +name: sext_i16_to_i32_vgpr_align2 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: sext_i16_to_i32_vgpr_align2 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX8: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + ; GFX9-LABEL: name: sext_i16_to_i32_vgpr_align2 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX9: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + ; GFX10-LABEL: name: sext_i16_to_i32_vgpr_align2 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX10: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 2, align 2, addrspace 4 ) +... +--- +name: zext_i8_to_i32_vgpr_align2 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: zext_i8_to_i32_vgpr_align2 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX8: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + ; GFX9-LABEL: name: zext_i8_to_i32_vgpr_align2 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX9: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + ; GFX10-LABEL: name: zext_i8_to_i32_vgpr_align2 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX10: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 1, align 2, addrspace 4 ) +... +--- +name: zext_i16_to_i32_vgpr_align2 +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX8-LABEL: name: zext_i16_to_i32_vgpr_align2 + ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX8: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + ; GFX9-LABEL: name: zext_i16_to_i32_vgpr_align2 + ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX9: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + ; GFX10-LABEL: name: zext_i16_to_i32_vgpr_align2 + ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GFX10: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 2, align 2, addrspace 4 ) +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -0,0 +1,430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_kernel void @constant_load_i8_align4(i8 addrspace (1)* %out, i8 addrspace(4)* %in) #0 { +; GFX8-LABEL: constant_load_i8_align4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: constant_load_i8_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: constant_load_i8_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm + %ld = load i8, i8 addrspace(4)* %in, align 4 + store i8 %ld, i8 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @constant_load_i16_align4(i16 addrspace (1)* %out, i16 addrspace(4)* %in) #0 { +; GFX8-LABEL: constant_load_i16_align4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: constant_load_i16_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: constant_load_i16_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm + %ld = load i16, i16 addrspace(4)* %in, align 4 + store i16 %ld, i16 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sextload_i8_to_i32_align4(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +; GFX8-LABEL: sextload_i8_to_i32_align4: +; GFX8: ; %bb.0: 
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_sext_i32_i8 s2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sextload_i8_to_i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_sext_i32_i8 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sextload_i8_to_i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sext_i32_i8 s2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %in, align 4 + %sext = sext i8 %load to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @sextload_i16_to_i32_align4(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +; GFX8-LABEL: sextload_i16_to_i32_align4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sextload_i16_to_i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sextload_i16_to_i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sext_i32_i16 s2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm + %load = load i16, i16 addrspace(1)* %in, align 4 + %sext = sext i16 %load to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @zextload_i8_to_i32_align4(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +; GFX8-LABEL: zextload_i8_to_i32_align4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: zextload_i8_to_i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zextload_i8_to_i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %in, align 4 + %zext = zext i8 %load to i32 + store i32 %zext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @zextload_i16_to_i32_align4(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +; GFX8-LABEL: zextload_i16_to_i32_align4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: zextload_i16_to_i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zextload_i16_to_i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm + %load = load i16, i16 addrspace(1)* %in, align 4 + %zext = zext i16 %load to i32 + store i32 %zext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +; GFX8-LABEL: constant_load_i8_align2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: constant_load_i8_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: constant_load_i8_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %in, align 2 + store i8 %load, i8 
addrspace(1)* %out, align 2 + ret void +} + +define amdgpu_kernel void @constant_load_i16_align2(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +; GFX8-LABEL: constant_load_i16_align2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: constant_load_i16_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: constant_load_i16_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm + %load = load i16, i16 addrspace(1)* %in, align 2 + store i16 %load, i16 addrspace(1)* %out, align 2 + ret void +} + +define amdgpu_kernel void @constant_sextload_i8_align2(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +; GFX8-LABEL: constant_sextload_i8_align2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_sbyte v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 2 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: constant_sextload_i8_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: constant_sextload_i8_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2 +; GFX10-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %in, align 2 + %sextload = sext i8 %load to i32 + store i32 %sextload, i32 addrspace(1)* %out, align 2 + ret void +} + +define amdgpu_kernel void @constant_zextload_i8_align2(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +; GFX8-LABEL: constant_zextload_i8_align2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 
v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 2 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: constant_zextload_i8_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: constant_zextload_i8_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2 +; GFX10-NEXT: s_endpgm + %load = load i8, i8 addrspace(1)* %in, align 2 + %zextload = zext i8 %load to i32 + store i32 %zextload, i32 addrspace(1)* %out, align 2 + ret void +} + +attributes #0 = { nounwind }