Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -132,7 +132,7 @@
   bool SelectFlat(SDValue Addr, SDValue &VAddr,
                   SDValue &SLC, SDValue &TFE) const;

-  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+  bool SelectSMRDOffset(SDValue ByteOffsetNode, bool AnyReg, SDValue &Offset,
                         bool &Imm) const;
   bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                   bool &Imm) const;
@@ -142,6 +142,7 @@
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferGLC(SDValue GLC, SDValue &Out) const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -1186,21 +1187,31 @@
     isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
 }

-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, bool AnyReg,
                                           SDValue &Offset, bool &Imm) const {
-
-  // FIXME: Handle non-constant offsets.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
-  if (!C)
-    return false;
+  if (!C) {
+    Offset = ByteOffsetNode;
+    Imm = false;
+    return AnyReg;
+  }

   SDLoc SL(ByteOffsetNode);
   AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
   int64_t ByteOffset = C->getSExtValue();
-  int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
-      ByteOffset >> 2 : ByteOffset;

-  if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+  bool Aligned;
+  int64_t EncodedOffset;
+
+  if (Gen <= AMDGPUSubtarget::SEA_ISLANDS) {
+    Aligned = ByteOffset % 4 == 0;
+    EncodedOffset = ByteOffset >> 2;
+  } else {
+    Aligned = true;
+    EncodedOffset = ByteOffset;
+  }
+
+  if (Aligned && isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
     Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
     Imm = true;
     return true;
@@ -1209,7 +1220,8 @@
   if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
     return false;

-  if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
+  if (Gen == AMDGPUSubtarget::SEA_ISLANDS &&
+      Aligned && isUInt<32>(EncodedOffset)) {
     // 32-bit Immediates are supported on Sea Islands.
     Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
   } else {
@@ -1222,13 +1234,13 @@
 }

 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
-                                    SDValue &Offset, bool &Imm) const {
+                                     SDValue &Offset, bool &Imm) const {
   SDLoc SL(Addr);

   if (CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
-    if (SelectSMRDOffset(N1, Offset, Imm)) {
+    if (SelectSMRDOffset(N1, false, Offset, Imm)) {
       SBase = N0;
       return true;
     }
@@ -1268,7 +1280,7 @@
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                              SDValue &Offset) const {
   bool Imm;
-  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+  return SelectSMRDOffset(Addr, true, Offset, Imm) && Imm;
 }

 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
@@ -1277,7 +1289,7 @@
     return false;

   bool Imm;
-  if (!SelectSMRDOffset(Addr, Offset, Imm))
+  if (!SelectSMRDOffset(Addr, true, Offset, Imm))
     return false;

   return !Imm && isa<ConstantSDNode>(Offset);
@@ -1286,10 +1298,25 @@
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
                                               SDValue &Offset) const {
   bool Imm;
-  return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
+  return SelectSMRDOffset(Addr, true, Offset, Imm) && !Imm &&
         !isa<ConstantSDNode>(Offset);
 }

+bool AMDGPUDAGToDAGISel::SelectSMRDBufferGLC(SDValue GLC,
+                                             SDValue &Out) const {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(GLC);
+  if (!C)
+    return false;
+
+  // Only VI supports GLC=1 for SMRD.
+  if (Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS &&
+      C->getZExtValue())
+    return false;
+
+  Out = GLC;
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                             SDValue &Base,
                                             SDValue &Offset) const {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2968,9 +2968,48 @@
       .addReg(AMDGPU::VCC);
     break;

+  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: {
+    unsigned ResultReg;
+    unsigned NewOpcode;
+
+    switch (Opcode) {
+    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+      NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+      ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      break;
+    case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+      ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+      break;
+    case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+      ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+      break;
+    }
+
+    MachineOperand &Dest = Inst.getOperand(0);
+
+    BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), ResultReg)
+        .addReg(Inst.getOperand(2).getReg()) // offset
+        .addReg(Inst.getOperand(1).getReg()) // rsrc
+        .addImm(0)                           // soffset
+        .addImm(0)                           // inst_offset
+        .addImm(Inst.getOperand(3).getImm()) // glc
+        .addImm(0)                           // slc
+        .addImm(0)                           // tfe
+        .addMemOperand(*Inst.memoperands_begin());
+
+    MRI.replaceRegWith(Dest.getReg(), ResultReg);
+    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+    Inst.eraseFromParent();
+    continue;
+  }
+
   case AMDGPU::S_BFE_U64:
   case AMDGPU::S_BFM_B64:
-    llvm_unreachable("Moving this op to VALU not implemented");
+    llvm_unreachable("moveToVALU: S_BFE_U64 and S_BFM_B64 not implemented");
   }

   if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
Index: lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.td
+++ lib/Target/AMDGPU/SIRegisterInfo.td
@@ -275,7 +275,7 @@
   let AllocationPriority = 7;
 }

-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, v2f32, i64, f64], 32, (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }
@@ -284,13 +284,13 @@
   let isAllocatable = 0;
 }

-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, v2f32, i64, f64, i1], 32,
   (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }

-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, v2f32, i64, f64, i1], 32,
   (add SReg_64_XEXEC, EXEC)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
@@ -299,7 +299,7 @@

 // Requires 2 s_mov_b64 to copy
 let CopyCost = 2 in {
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
   let AllocationPriority = 10;
 }
@@ -307,7 +307,7 @@
   let isAllocatable = 0;
 }

-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
   let AllocationPriority = 10;
 }
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -236,9 +236,11 @@
 def SMRDImm         : ComplexPattern<i64, 2, "SelectSMRDImm">;
 def SMRDImm32       : ComplexPattern<i64, 2, "SelectSMRDImm32">;
 def SMRDSgpr        : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDSgprConst   : ComplexPattern<i64, 2, "SelectSMRDSgprConst">;
 def SMRDBufferImm   : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgpr  : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+def SMRDBufferGLC   : ComplexPattern<i32, 1, "SelectSMRDBufferGLC">;

 let Predicates = [isGCN] in {

@@ -257,6 +259,23 @@
   >;
 }

+multiclass SMRD_LoadIntrinsicPat <SDPatternOperator name, string opcode,
+                                  ValueType vt> {
+  def : Pat<
+    (vt (name v4i32:$rsrc, 0,
+              (SMRDBufferImm i32:$offset),
+              (SMRDBufferGLC i32:$glc), 0)),
+    (!cast<SM_Pseudo>(opcode # "_IMM") $rsrc, $offset, (as_i1imm $glc))
+  >;
+
+  def : Pat<
+    (vt (name v4i32:$rsrc, 0,
+              (SMRDBufferSgpr i32:$offset),
+              (SMRDBufferGLC i32:$glc), 0)),
+    (!cast<SM_Pseudo>(opcode # "_SGPR") $rsrc, $offset, (as_i1imm $glc))
+  >;
+}
+
 let Predicates = [isSICI] in {
 def : Pat <
   (i64 (readcyclecounter)),
@@ -275,6 +294,10 @@
 defm : SMRD_Pattern <"S_LOAD_DWORDX8",  v8i32>;
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;

+defm : SMRD_LoadIntrinsicPat <int_amdgcn_buffer_load, "S_BUFFER_LOAD_DWORD",   f32>;
+defm : SMRD_LoadIntrinsicPat <int_amdgcn_buffer_load, "S_BUFFER_LOAD_DWORDX2", v2f32>;
+defm : SMRD_LoadIntrinsicPat <int_amdgcn_buffer_load, "S_BUFFER_LOAD_DWORDX4", v4f32>;
+
 // 1. Offset as an immediate
 def SM_LOAD_PATTERN : Pat <  // name this pattern to reuse AddedComplexity on CI
   (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -1,10 +1,12 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SI -check-prefix=SICI -check-prefix=SIVI
+;RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=CI -check-prefix=SICI -check-prefix=CIVI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI -check-prefix=CIVI -check-prefix=SIVI

 ;CHECK-LABEL: {{^}}buffer_load:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
+;CHECK-DAG: s_buffer_load_dwordx4 s[{{[0-9:]+}}], s[0:3], 0x0
+;VI-DAG: s_buffer_load_dwordx4 s[{{[0-9:]+}}], s[0:3], 0x0 glc
+;SICI-DAG: buffer_load_dwordx4 v[{{[0-9:]+}}], off, s[0:3], 0 glc
+;CHECK-DAG: buffer_load_dwordx4 v[{{[0-9:]+}}], off, s[0:3], 0 slc
 ;CHECK: s_waitcnt
 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
 main_body:
@@ -18,7 +20,9 @@
 }

 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+;SICI: s_mov_b32 s4, 42
+;SICI: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
+;VI: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x2a
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
 main_body:
@@ -27,9 +31,12 @@
 }

 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
-;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
-;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
-;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1
+;SI: s_movk_i32 s4, 0x2000
+;SI: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
+;TODO: this should use SMEM:
+;CI: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000
+;CI: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;VI: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x2000
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
 main_body:
@@ -46,6 +53,15 @@
   ret <4 x float> %data
 }

+;CHECK-LABEL: {{^}}buffer_load_ofs_smem:
+;CHECK: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_smem(<4 x i32> inreg, i32 inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_ofs:
 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
 ;CHECK: s_waitcnt
@@ -55,8 +71,22 @@
   ret <4 x float> %data
 }

+;CHECK-LABEL: {{^}}buffer_load_ofs_imm_smem:
+;CHECK: s_add_i32 s4, s4, 58
+;CHECK: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm_smem(<4 x i32> inreg, i32 inreg) {
+main_body:
+  %ofs = add i32 %1, 58
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;TODO: v_add could be folded into VMEM:
+;CHECK: v_add_i32_e32 v0, vcc, 58, v0
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;XCHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
 main_body:
@@ -102,6 +132,16 @@
   ret <2 x float> %data
 }

+;CHECK-LABEL: {{^}}buffer_load_negative_offset_smem:
+;CHECK: s_add_i32 s4, s4, -16
+;CHECK: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
+define amdgpu_ps <4 x float> @buffer_load_negative_offset_smem(<4 x i32> inreg, i32 inreg %ofs) {
+main_body:
+  %ofs.1 = add i32 %ofs, -16
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_negative_offset:
 ;CHECK: v_add_i32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen