Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9299,6 +9299,54 @@
     Ops.push_back(ImpDef.getValue(1));
     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   }
+  case AMDGPU::FLAT_LOAD_UBYTE_D16_HI:
+  case AMDGPU::FLAT_LOAD_SBYTE_D16_HI:
+  case AMDGPU::FLAT_LOAD_SHORT_D16_HI:
+  case AMDGPU::GLOBAL_LOAD_UBYTE_D16_HI:
+  case AMDGPU::GLOBAL_LOAD_SBYTE_D16_HI:
+  case AMDGPU::GLOBAL_LOAD_SHORT_D16_HI:
+  case AMDGPU::DS_READ_U16_D16_HI:
+  case AMDGPU::DS_READ_I8_D16_HI:
+  case AMDGPU::DS_READ_U8_D16_HI:
+  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
+  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
+  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
+  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
+  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
+  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: {
+    // These loads write to the HI part of a register, so chain each of
+    // them to the op that writes to the LO part of the same register to
+    // keep the two writes in order.
+    unsigned NumOps = Node->getNumOperands();
+    SDValue OldChain = Node->getOperand(NumOps-1);
+
+    // Nothing to do if the last operand is not a chain.
+    if (OldChain.getValueType() != MVT::Other)
+      break;
+
+    // The operand before the chain is treated as the LO value. The last
+    // result of its defining node is the new chain, provided that node
+    // has more than one result and the last one is a chain.
+    SDValue Lo = Node->getOperand(NumOps-2);
+    SDNode *LoNode = Lo.getNode();
+    if (LoNode->getNumValues() == 1 ||
+        LoNode->getValueType(LoNode->getNumValues() - 1) != MVT::Other)
+      break;
+
+    SDValue NewChain = Lo.getValue(LoNode->getNumValues() - 1);
+    if (NewChain == OldChain) // Already replaced.
+      break;
+
+    SmallVector<SDValue, 16> Ops;
+    for (unsigned I = 0; I < NumOps-1; ++I)
+      Ops.push_back(Node->getOperand(I));
+    // Replace the chain.
+    Ops.push_back(NewChain);
+    MachineSDNode *NewNode = DAG.getMachineNode(Opcode, SDLoc(Node),
+                                                Node->getVTList(), Ops);
+    DAG.setNodeMemRefs(NewNode, Node->memoperands());
+    return NewNode;
+  }
   default:
     break;
   }
Index: test/CodeGen/AMDGPU/chain-hi-to-lo.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -0,0 +1,142 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_private:
+; CHECK: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_short_d16_hi [[DST]], off, [[RSRC]], [[SOFF]]
+define <2 x half> @chain_hi_to_lo_private() {
+bb:
+  %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
+  %load_lo = load half, half addrspace(5)* %gep_lo
+  %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
+  %load_hi = load half, half addrspace(5)* %gep_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_private_different_bases:
+; CHECK: buffer_load_ushort [[DST:v[0-9]+]], v{{[0-9]+}}, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_short_d16_hi [[DST]], v{{[0-9]+}}, [[RSRC]], [[SOFF]] offen
+define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
+bb:
+  %load_lo = load half, half addrspace(5)* %base_lo
+  %load_hi = load half, half addrspace(5)* %base_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_arithmatic:
+; CHECK: v_add_f16_e32 [[DST:v[0-9]+]], 1.0, v{{[0-9]+}}
+; CHECK-NEXT: buffer_load_short_d16_hi [[DST]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
+bb:
+  %arith_lo = fadd half %in, 1.0
+  %load_hi = load half, half addrspace(5)* %base
+
+  %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_group:
+; CHECK: ds_read_u16 [[DST:v[0-9]+]], [[ADDR:v[0-9]+]] offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16_d16_hi [[DST]], [[ADDR]]
+define <2 x half> @chain_hi_to_lo_group() {
+bb:
+  %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
+  %load_lo = load half, half addrspace(3)* %gep_lo
+  %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
+  %load_hi = load half, half addrspace(3)* %gep_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_group_different_bases:
+; CHECK: ds_read_u16 [[DST:v[0-9]+]], v{{[0-9]+}}
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16_d16_hi [[DST]], v{{[0-9]+}}
+define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
+bb:
+  %load_lo = load half, half addrspace(3)* %base_lo
+  %load_hi = load half, half addrspace(3)* %base_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_global:
+; CHECK: global_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off
+; CHECK: global_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}, off
+define <2 x half> @chain_hi_to_lo_global() {
+bb:
+  %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
+  %load_lo = load half, half addrspace(1)* %gep_lo
+  %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
+  %load_hi = load half, half addrspace(1)* %gep_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_global_different_bases:
+; CHECK: global_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}, off
+define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
+bb:
+  %load_lo = load half, half addrspace(1)* %base_lo
+  %load_hi = load half, half addrspace(1)* %base_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_flat:
+; CHECK: flat_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; CHECK: flat_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}
+define <2 x half> @chain_hi_to_lo_flat() {
+bb:
+  %gep_lo = getelementptr inbounds half, half addrspace(0)* null, i64 1
+  %load_lo = load half, half addrspace(0)* %gep_lo
+  %gep_hi = getelementptr inbounds half, half addrspace(0)* null, i64 0
+  %load_hi = load half, half addrspace(0)* %gep_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+
+; CHECK-LABEL: {{^}}chain_hi_to_lo_flat_different_bases:
+; CHECK: flat_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}
+define <2 x half> @chain_hi_to_lo_flat_different_bases(half addrspace(0)* %base_lo, half addrspace(0)* %base_hi) {
+bb:
+  %load_lo = load half, half addrspace(0)* %base_lo
+  %load_hi = load half, half addrspace(0)* %base_hi
+
+  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
+  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
+
+  ret <2 x half> %result
+}
+