Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -435,6 +435,10 @@ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; + case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: + return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; + case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: + return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; default: return -1; } @@ -456,6 +460,18 @@ return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; + case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: + return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; + case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: + return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; default: return -1; } @@ -479,7 +495,7 @@ return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .add(*Reg) .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) @@ -488,6 +504,11 @@ .addImm(0) // slc .addImm(0) // tfe .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + const MachineOperand *VDataIn = TII->getNamedOperand(*MI, + AMDGPU::OpName::vdata_in); + if (VDataIn) + NewMI.add(*VDataIn); return true; } Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo: ; GCN: s_waitcnt Index: test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- test/CodeGen/AMDGPU/load-lo16.ll +++ test/CodeGen/AMDGPU/load-lo16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo: ; GCN: s_waitcnt @@ -588,4 +588,63 @@ ret void } +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094 + +; VI: buffer_load_ushort v +define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i16], align 2 + %reg.bc = bitcast i32 %reg to <2 x i16> + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 + %load = load volatile i16, i16* %gep + %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 + +; VI: buffer_load_sbyte v +define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %reg.bc = bitcast i32 %reg to <2 x i16> + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8* %gep + %load.ext = sext i8 %load to i16 + %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 + +; VI: buffer_load_ubyte v +define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %reg.bc = bitcast i32 %reg to <2 x i16> + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8* %gep + %load.ext = zext i8 %load to i16 + %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/store-hi16.ll +++ test/CodeGen/AMDGPU/store-hi16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt @@ -591,4 +591,39 @@ ret void } +; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: +; GCN: s_waitcnt +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094 +define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i16], align 2 + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 + store i16 %hi, i16* %gep + ret void +} + +; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: +; GCN: s_waitcnt +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095 +define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %trunc = trunc i16 %hi to i8 + store i8 %trunc, i8* %gep + ret void +} + attributes #0 = { nounwind }