Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8998,6 +8998,42 @@
     Ops.push_back(ImpDef.getValue(1));
     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   }
+  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
+  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
+  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
+  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
+  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
+  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: {
+    // These loads write only to the HI part of a register, so chain them
+    // to the op that writes to the LO part of the same register in order
+    // to keep the two writes ordered.
+    unsigned NumOps = Node->getNumOperands();
+    SDValue OldChain = Node->getOperand(NumOps-1);
+
+    if (OldChain.getValueType() != MVT::Other) // Last operand is not a chain.
+      break;
+
+    // Look for the replacement chain: the chain result of the LO node.
+    SDValue Lo = Node->getOperand(NumOps-2);
+    SDNode *LoNode = Lo.getNode();
+    if (LoNode->getNumValues() == 1 ||
+        LoNode->getValueType(LoNode->getNumValues() - 1) != MVT::Other)
+      break;
+
+    SDValue NewChain = Lo.getValue(LoNode->getNumValues() - 1);
+    if (NewChain == OldChain) // Already replaced.
+      break;
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned I = 0; I < NumOps-1; ++I)
+      Ops.push_back(Node->getOperand(I));
+    // Replace the chain.
+    Ops.push_back(NewChain);
+    MachineSDNode *NewNode = DAG.getMachineNode(Opcode, SDLoc(Node),
+                                                Node->getVTList(), Ops);
+    DAG.setNodeMemRefs(NewNode, Node->memoperands());
+    return NewNode;
+  }
   default:
     break;
   }
Index: test/CodeGen/AMDGPU/chain-hi-to-lo.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}chain_hi_to_lo:
+; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_short_d16_hi [[DST]], off, [[RSRC]], [[SOFF]]
+
+define amdgpu_kernel void @chain_hi_to_lo() {
+bb:
+  %loads = load <2 x half>, <2 x half> addrspace(5)* null, align 2
+  %shuffled = shufflevector <2 x half> %loads, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  br label %bb1
+
+bb1:
+  call void asm sideeffect "; use $0", "v"(<2 x half> %shuffled)
+  br label %bb1
+}
+