diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2004,6 +2004,18 @@
   Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
             [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;
 
+def int_amdgcn_ds_bvh_stack_rtn :
+  Intrinsic<
+    [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
+    [
+      llvm_i32_ty, // %addr
+      llvm_i32_ty, // %data0
+      llvm_v4i32_ty, // %data1
+      llvm_i32_ty, // %offset
+    ],
+    [ImmArg<ArgIndex<3>>, IntrWillReturn]
+  >;
+
 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
 //
 // These operations perform a matrix multiplication and accumulation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -268,6 +268,7 @@
   void SelectBRCOND(SDNode *N);
   void SelectFMAD_FMA(SDNode *N);
   void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+  void SelectDSBvhStackIntrinsic(SDNode *N);
   void SelectDS_GWS(SDNode *N, unsigned IntrID);
   void SelectInterpP1F16(SDNode *N);
   void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -43,7 +43,6 @@
 //===----------------------------------------------------------------------===//
 
 namespace {
-
 static SDValue stripBitcast(SDValue Val) {
   return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
 }
@@ -96,7 +95,7 @@
   return In;
 }
 
-}  // end anonymous namespace
+} // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                       "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                       false)
@@ -2380,6 +2379,19 @@
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
 }
 
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
+  unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+  SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
+                   N->getOperand(5), N->getOperand(0)};
+
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  MachineMemOperand *MMO = M->getMemOperand();
+  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
   switch (IntrID) {
   case Intrinsic::amdgcn_ds_gws_init:
@@ -2532,6 +2544,9 @@
     SelectDSAppendConsume(N, IntrID);
     return;
   }
+  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+    SelectDSBvhStackIntrinsic(N);
+    return;
   }
 
   SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -120,6 +120,7 @@
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
   bool selectSBarrier(MachineInstr &MI) const;
+  bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
   bool selectImageIntrinsic(MachineInstr &MI,
                             const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1803,6 +1803,33 @@
   return true;
 }
 
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
+    MachineInstr &MI) const {
+  Register Dst0 = MI.getOperand(0).getReg();
+  Register Dst1 = MI.getOperand(1).getReg();
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
+
+  Register Addr = MI.getOperand(3).getReg();
+  Register Data0 = MI.getOperand(4).getReg();
+  Register Data1 = MI.getOperand(5).getReg();
+  unsigned Offset = MI.getOperand(6).getImm();
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
+                 .addDef(Dst1)
+                 .addUse(Addr)
+                 .addUse(Data0)
+                 .addUse(Data1)
+                 .addImm(Offset)
+                 .cloneMemRefs(MI);
+
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     MachineInstr &I) const {
   unsigned IntrinsicID = I.getIntrinsicID();
@@ -1841,6 +1868,8 @@
       return false;
     }
     break;
+  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+    return selectDSBvhStackIntrinsic(I);
   }
   return selectImpl(I, *CoverageInfo);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4745,6 +4745,20 @@
     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
     break;
+  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+    OpdsMapping[0] =
+        getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
+    OpdsMapping[1] =
+        getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
+    OpdsMapping[3] =
+        getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
+    OpdsMapping[4] =
+        getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
+    OpdsMapping[5] =
+        getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
+    break;
+  }
+
   default:
     return getInvalidInstructionMapping();
   }
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6341,11 +6341,20 @@
 void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
                                 bool IsGdsHardcoded) {
   OptionalImmIndexMap OptionalIdx;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
   AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset;
 
   for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
 
+    auto TiedTo =
+        Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO);
+
+    if (TiedTo != -1) {
+      assert((unsigned)TiedTo < Inst.getNumOperands());
+      Inst.addOperand(Inst.getOperand(TiedTo));
+    }
+
     // Add the register arguments
     if (Op.isReg()) {
       Op.addRegOperands(Inst, 1);
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -277,6 +277,18 @@
   }
 }
 
+class DS_BVH_STACK<string opName>
+: DS_Pseudo<opName,
+    (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst, VGPR_32:$addr),
+    (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, VReg_128:$data1, offset:$offset),
+    " $vdst, $addr, $data0, $data1$offset"> {
+  let Constraints = "$addr = $addr_in";
+  let DisableEncoding = "$addr_in";
+  let has_gds = 0;
+  let gdsValue = 0;
+  let hasSideEffects = 1;
+  let SchedRW = [WriteLDS, WriteLDS];
+}
 
 class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit hasGds = 0,
                 Operand ofs = offset,
                 RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
@@ -713,6 +725,7 @@
 def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>;
 def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
+def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
 
 } // let SubtargetPredicate = isGFX11Plus
 
@@ -1250,6 +1263,7 @@
 defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>;
 defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>;
 defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>;
+defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>;
 
 //===----------------------------------------------------------------------===//
 // GFX10.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1161,6 +1161,23 @@
                  MachineMemOperand::MOVolatile;
     return true;
   }
+  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+
+    const GCNTargetMachine &TM =
+        static_cast<const GCNTargetMachine &>(getTargetMachine());
+
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    Info.ptrVal = MFI->getGWSPSV(TM);
+
+    // This is an abstract access, but we need to specify a type and size.
+    Info.memVT = MVT::i32;
+    Info.size = 4;
+    Info.align = Align(4);
+
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    return true;
+  }
   default:
     return false;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32, i32, <4 x i32>, i32 immarg)
+
+define amdgpu_gs void @test_ds_bvh_stack(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
+; CHECK-LABEL: test_ds_bvh_stack:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; CHECK-NEXT:    global_store_b32 v[6:7], v0, off
+; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
+  %vdst = extractvalue { i32, i32 } %pair, 0
+  %newaddr = extractvalue { i32, i32 } %pair, 1
+  %res = add i32 %vdst, %newaddr
+  store i32 %res, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_1(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
+; CHECK-LABEL: test_ds_bvh_stack_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5] offset:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; CHECK-NEXT:    global_store_b32 v[6:7], v0, off
+; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
+  %vdst = extractvalue { i32, i32 } %pair, 0
+  %newaddr = extractvalue { i32, i32 } %pair, 1
+  %res = add i32 %vdst, %newaddr
+  store i32 %res, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_ds.s b/llvm/test/MC/AMDGPU/gfx11_asm_ds.s
--- a/llvm/test/MC/AMDGPU/gfx11_asm_ds.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_ds.s
@@ -1961,3 +1961,9 @@
 
 ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pip")
 // GFX11: [0x07,0x09,0xd4,0xd8,0x02,0x00,0x00,0x08]
+
+ds_bvh_stack_rtn_b32 v255, v254, v253, v[249:252]
+// GFX11: [0x00,0x00,0xb4,0xda,0xfe,0xfd,0xf9,0xff]
+
+ds_bvh_stack_rtn_b32 v255, v254, v253, v[249:252] offset:127
+// GFX11: [0x7f,0x00,0xb4,0xda,0xfe,0xfd,0xf9,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
@@ -5166,3 +5166,15 @@
 
 # GFX11: ds_sub_gs_reg_rtn v[1:2], v255 gds ; encoding: [0x00,0x00,0xee,0xd9,0x00,0xff,0x00,0x01]
 0x00,0x00,0xee,0xd9,0x00,0xff,0x00,0x01
+
+# GFX11: ds_bvh_stack_rtn_b32 v1, v2, v3, v[4:7] offset:127 ; encoding: [0x7f,0x00,0xb4,0xda,0x02,0x03,0x04,0x01]
+0x7f,0x00,0xb4,0xda,0x02,0x03,0x04,0x01
+
+# GFX11: ds_bvh_stack_rtn_b32 v1, v2, v3, v[4:7] ; encoding: [0x00,0x00,0xb4,0xda,0x02,0x03,0x04,0x01]
+0x00,0x00,0xb4,0xda,0x02,0x03,0x04,0x01
+
+# GFX11: ds_bvh_stack_rtn_b32 v254, v255, v253, v[5:8] offset:127 ; encoding: [0x7f,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe]
+0x7f,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe
+
+# GFX11: ds_bvh_stack_rtn_b32 v254, v255, v253, v[5:8] ; encoding: [0x00,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe]
+0x00,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe
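
For reference, a minimal LLVM IR sketch (not part of the patch) of how a caller
consumes both results of the new intrinsic. The function and value names below
are illustrative assumptions; the roles of the two results follow the operand
comments in the intrinsic definition above (%vdst and the updated %addr):

  ; %addr is the current BVH stack address in LDS, %last is the last-visited
  ; node, and %children holds four candidate node pointers (assumed roles).
  define amdgpu_gs void @traverse_step(i32 %addr, i32 %last, <4 x i32> %children, i32 addrspace(1)* %out) {
    %r = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %last, <4 x i32> %children, i32 0)
    %vdst = extractvalue { i32, i32 } %r, 0    ; data returned by the stack operation
    %newaddr = extractvalue { i32, i32 } %r, 1 ; updated stack address
    store i32 %vdst, i32 addrspace(1)* %out, align 4
    ret void
  }

  declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32, i32, <4 x i32>, i32 immarg)

Because $addr is tied to $addr_in on DS_BVH_STACK_RTN_B32, a traversal loop
would feed %newaddr back in as the %addr operand of the next call.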