Index: lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -240,8 +240,20 @@
   if (!Cluster)
     return;
 
-  // Sort them in increasing order.
-  llvm::sort(Offsets);
+  // Sort them in increasing order, but don't break value dependencies between
+  // the loads.
+  llvm::sort(
+      Offsets.begin(),
+      Offsets.end(),
+      [&O2SMap](int64_t OffsetA, int64_t OffsetB) {
+        const SDNode *A = O2SMap[OffsetA];
+        const SDNode *B = O2SMap[OffsetB];
+        if (A->isOperandOf(B))
+          return true;
+        if (B->isOperandOf(A))
+          return false;
+        return OffsetA < OffsetB;
+      });
 
   // Check if the loads are close enough.
   SmallVector<SDNode*, 4> Loads;
@@ -252,7 +264,14 @@
   for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
     int64_t Offset = Offsets[i];
     SDNode *Load = O2SMap[Offset];
-    if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
+
+    // These may not be perfectly sorted by offset if there are value
+    // dependencies between some of the loads. Sort them when asking the target.
+    int64_t LowOffset = std::min(BaseOff, Offset);
+    int64_t HighOffset = std::max(BaseOff, Offset);
+
+    if (!TII->shouldScheduleLoadsNear(BaseLoad, Load,
+                                      LowOffset, HighOffset, NumLoads))
       break; // Stop right here. Ignore loads that are further away.
     Loads.push_back(Load);
     ++NumLoads;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9367,51 +9367,6 @@
     Ops.push_back(ImpDef.getValue(1));
     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   }
-  case AMDGPU::FLAT_LOAD_UBYTE_D16_HI:
-  case AMDGPU::FLAT_LOAD_SBYTE_D16_HI:
-  case AMDGPU::FLAT_LOAD_SHORT_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_UBYTE_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_SBYTE_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_SHORT_D16_HI:
-  case AMDGPU::DS_READ_U16_D16_HI:
-  case AMDGPU::DS_READ_I8_D16_HI:
-  case AMDGPU::DS_READ_U8_D16_HI:
-  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
-  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
-  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: {
-    // For these loads that write to the HI part of a register,
-    // we should chain them to the op that writes to the LO part
-    // of the register to maintain the order.
-    unsigned NumOps = Node->getNumOperands();
-    SDValue OldChain = Node->getOperand(NumOps-1);
-
-    if (OldChain.getValueType() != MVT::Other)
-      break;
-
-    // Look for the chain to replace to.
-    SDValue Lo = Node->getOperand(NumOps-2);
-    SDNode *LoNode = Lo.getNode();
-    if (LoNode->getNumValues() == 1 ||
-        LoNode->getValueType(LoNode->getNumValues() - 1) != MVT::Other)
-      break;
-
-    SDValue NewChain = Lo.getValue(LoNode->getNumValues() - 1);
-    if (NewChain == OldChain) // Already replaced.
-      break;
-
-    SmallVector<SDValue, 16> Ops;
-    for (unsigned I = 0; I < NumOps-1; ++I)
-      Ops.push_back(Node->getOperand(I));
-    // Repalce the Chain.
-    Ops.push_back(NewChain);
-    MachineSDNode *NewNode = DAG.getMachineNode(Opcode, SDLoc(Node),
-                                                Node->getVTList(), Ops);
-    DAG.setNodeMemRefs(NewNode, Node->memoperands());
-    return NewNode;
-  }
   default:
     break;
   }
Index: test/CodeGen/AMDGPU/chain-hi-to-lo.ll
===================================================================
--- test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}chain_hi_to_lo_private:
 ; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
@@ -139,3 +139,39 @@
 
   ret <2 x half> %result
 }
+
+; Make sure we don't lose any of the private stores.
+; GCN-LABEL: {{^}}vload2_private:
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_store_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
+entry:
+  %loc = alloca [3 x i16], align 2, addrspace(5)
+  %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+  %tmp = load i16, i16 addrspace(1)* %in, align 2
+  %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
+  store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
+  %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
+  %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
+  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+  store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
+  %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
+  %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
+  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
+  store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
+  %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
+  %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
+  store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
+  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+  %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
+  %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
+  %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
+  store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
+  %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+  ret void
+}