diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1505,6 +1505,34 @@
             [IntrNoMem, IntrSpeculatable, IntrWillReturn,
              ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 
+// __int_amdgcn_interp_inreg_p10 <p>, <i>, <p0>
+def int_amdgcn_interp_inreg_p10 :
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// __int_amdgcn_interp_inreg_p2 <p>, <j>, <tmp>
+def int_amdgcn_interp_inreg_p2 :
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// __int_amdgcn_interp_inreg_p10_f16 <p>, <i>, <p0>, <high>
+// high selects whether high or low 16-bits are used for p and p0 operands
+def int_amdgcn_interp_inreg_p10_f16:
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+             ImmArg<ArgIndex<3>>]>;
+
+// __int_amdgcn_interp_inreg_p2_f16 <p>, <j>, <tmp>, <high>
+// high selects whether high or low 16-bits are used for p operand
+def int_amdgcn_interp_inreg_p2_f16 :
+  Intrinsic<[llvm_half_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+             ImmArg<ArgIndex<3>>]>;
+
 // Deprecated: use llvm.amdgcn.live.mask instead.
 def int_amdgcn_ps_live : Intrinsic <
   [llvm_i1_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -59,6 +59,14 @@
     GIComplexOperandMatcher,
     GIComplexPatternEquiv;
 
+def gi_vinterpmods :
+    GIComplexOperandMatcher<s32, "selectVINTERPMods">,
+    GIComplexPatternEquiv<VINTERPMods>;
+
+def gi_vinterpmods_hi :
+    GIComplexOperandMatcher<s32, "selectVINTERPModsHi">,
+    GIComplexPatternEquiv<VINTERPModsHi>;
+
 // FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
 def gi_vop3opsel :
     GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -218,6 +218,11 @@
   bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                          SDValue &Clamp, SDValue &Omod) const;
 
+  bool SelectVINTERPModsImpl(SDValue In, SDValue &Src, SDValue &SrcMods,
+                             bool OpSel) const;
+  bool SelectVINTERPMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVINTERPModsHi(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
   bool SelectVOP3OMods(SDValue In, SDValue &Src,
                        SDValue &Clamp, SDValue &Omod) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,7 +13,9 @@
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/R600MCTargetDesc.h"
 #include "R600RegisterInfo.h"
 #include "SIMachineFunctionInfo.h"
@@ -2606,6 +2608,30 @@
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
+                                               SDValue &SrcMods,
+                                               bool OpSel) const {
+  unsigned Mods;
+  if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
+    if (OpSel)
+      Mods |= SISrcMods::OP_SEL_0;
+    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
+                                           SDValue &SrcMods) const {
+  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
+                                             SDValue &SrcMods) const {
+  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods, SDValue &Clamp,
                                          SDValue &Omod) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -150,8 +150,9 @@
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectWaveAddress(MachineInstr &I) const;
 
-  std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
-                                                   bool AllowAbs = true) const;
+  std::pair<Register, unsigned>
+  selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true,
+                     bool OpSel = false, bool ForceVGPR = false) const;
 
   InstructionSelector::ComplexRendererFns
   selectVCSRC(MachineOperand &Root) const;
@@ -191,6 +192,11 @@
   InstructionSelector::ComplexRendererFns
   selectVOP3OpSelMods(MachineOperand &Root) const;
 
+  InstructionSelector::ComplexRendererFns
+  selectVINTERPMods(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectVINTERPModsHi(MachineOperand &Root) const;
+
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;
 
   InstructionSelector::ComplexRendererFns
diff --git
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3497,9 +3497,8 @@
 }
 
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
-                                              bool AllowAbs) const {
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+    MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const {
   Register Src = Root.getReg();
   Register OrigSrc = Src;
   unsigned Mods = 0;
@@ -3516,7 +3515,10 @@
     Mods |= SISrcMods::ABS;
   }
 
-  if (Mods != 0 &&
+  if (OpSel)
+    Mods |= SISrcMods::OP_SEL_0;
+
+  if ((Mods != 0 || ForceVGPR) &&
       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
     MachineInstr *UseMI = Root.getParent();
@@ -3708,6 +3710,36 @@
   }};
 }
 
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+                                           /* AllowAbs */ false,
+                                           /* OpSel */ false,
+                                           /* ForceVGPR */ true);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+                                           /* AllowAbs */ false,
+                                           /* OpSel */ true,
+                                           /* ForceVGPR */ true);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+  }};
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
   SmallVector<GEPInfo, 4> AddrInfo;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3017,6 +3017,12 @@
     constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
     return;
   }
+  case Intrinsic::amdgcn_interp_inreg_p10:
+  case Intrinsic::amdgcn_interp_inreg_p2:
+  case Intrinsic::amdgcn_interp_inreg_p10_f16:
+  case Intrinsic::amdgcn_interp_inreg_p2_f16:
+    applyDefaultMapping(OpdMapper);
+    return;
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16: {
     // Doing a waterfall loop over these wouldn't make any sense.
@@ -4469,6 +4475,17 @@
     OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
     break;
   }
+  case Intrinsic::amdgcn_interp_inreg_p10:
+  case Intrinsic::amdgcn_interp_inreg_p2:
+  case Intrinsic::amdgcn_interp_inreg_p10_f16:
+  case Intrinsic::amdgcn_interp_inreg_p2_f16: {
+    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+    OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+    OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+    break;
+  }
   case Intrinsic::amdgcn_ballot: {
     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -111,10 +111,55 @@
 } // SubtargetPredicate = isGFX11Plus
 
+class VInterpF32Pat <SDPatternOperator op, Instruction inst> : GCNPat <
+  (f32 (op
+    (VINTERPMods f32:$src0, i32:$src0_modifiers),
+    (VINTERPMods f32:$src1, i32:$src1_modifiers),
+    (VINTERPMods f32:$src2, i32:$src2_modifiers))),
+  (inst $src0_modifiers, $src0,
+        $src1_modifiers, $src1,
+        $src2_modifiers, $src2,
+        0, /* clamp */
+        7) /* wait_exp */
+>;
+
 def VINTERP_OPSEL {
   int LOW = 0;
   int HIGH = 0xa;
 }
+
+class VInterpF16Pat <SDPatternOperator op, ValueType dst_type, Instruction inst,
+                     bit high, list<ComplexPattern> pat> : GCNPat <
+  (dst_type (op
+    (pat[0] f32:$src0, i32:$src0_modifiers),
+    (pat[1] f32:$src1, i32:$src1_modifiers),
+    (pat[2] f32:$src2, i32:$src2_modifiers),
+    !if(high, (i1 -1), (i1 0)))),
+  (inst $src0_modifiers, $src0,
+        $src1_modifiers, $src1,
+        $src2_modifiers, $src2,
+        0, /* clamp */
+        /* op_sel = 0 */
+        7) /* wait_exp */
+>;
+
+multiclass VInterpF16Pat <SDPatternOperator op, ValueType dst_type,
+                          Instruction inst, list<ComplexPattern> high_pat> {
+  def : VInterpF16Pat<op, dst_type, inst, 0,
+                      [VINTERPMods, VINTERPMods, VINTERPMods]>;
+  def : VInterpF16Pat<op, dst_type, inst, 1, high_pat>;
+}
+
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16, f32,
+                     V_INTERP_P10_F16_F32_inreg,
+                     [VINTERPModsHi, VINTERPMods, VINTERPModsHi]>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16, f16,
+                     V_INTERP_P2_F16_F32_inreg,
+                     [VINTERPModsHi, VINTERPMods, VINTERPMods]>;
+
 //===----------------------------------------------------------------------===//
 // VINTERP Real Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    lds_param_load v0, attr0.y
+; GCN-NEXT:    lds_param_load v1, attr1.x
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
+; GCN-NEXT:    exp mrt0 v3, v2, v0, v1 done
+; GCN-NEXT:    s_endpgm
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
+  %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+  %p0_0 = call float
@llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0) + %p1_0 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0) + %p0_1 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1) + %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_1, float %p1_0, float %p1_1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_f32_many: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: lds_param_load v0, attr0.x +; GCN-NEXT: lds_param_load v1, attr1.x +; GCN-NEXT: lds_param_load v2, attr2.x +; GCN-NEXT: lds_param_load v3, attr3.x +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7 +; GCN-NEXT: exp mrt0 v0, v1, v2, v3 done +; GCN-NEXT: s_endpgm +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) + %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0) + %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0) + %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0) + %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0) + %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1) + %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0) 
+ %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2) + %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0) + %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3) + %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_f32_many_vm: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: lds_param_load v2, attr0.x +; GCN-NEXT: lds_param_load v3, attr1.x +; GCN-NEXT: lds_param_load v4, attr2.x +; GCN-NEXT: lds_param_load v5, attr3.x +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GCN-NEXT: exp mrt0 v2, v3, v4, v0 done +; GCN-NEXT: s_endpgm +main_body: + %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1 + %i = load float, float addrspace(1)* %i.ptr, align 4 + %j.ptr = getelementptr float, float addrspace(1)* %ptr, i32 2 + %j = load float, float addrspace(1)* %j.ptr, align 4 + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) + %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0) + %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0) + %p0_0 = call float 
@llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0) + %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0) + %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1) + %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0) + %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2) + %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0) + %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3) + %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_f16: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: lds_param_load v0, attr0.x +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7 +; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_add_f16_e32 v0, v3, v0 +; GCN-NEXT: ; return to shader part epilog +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) + %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %l_p0, i1 0) + %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1) + %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %h_p0, i1 1) + %res = fadd half %l_p1, %h_p1 + ret half %res +} + +declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1 +declare 
float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0 +declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0 +declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0 +declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_f32: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: lds_param_load v0, attr0.y +; GCN-NEXT: lds_param_load v1, attr1.x +; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7 +; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done +; GCN-NEXT: s_endpgm +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0) + %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) + %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0) + %p1_0 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0) + %p0_1 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1) + %p1_1 = call float 
@llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_1, float %p1_0, float %p1_1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_f32_many: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: lds_param_load v0, attr0.x +; GCN-NEXT: lds_param_load v1, attr1.x +; GCN-NEXT: lds_param_load v2, attr2.x +; GCN-NEXT: lds_param_load v3, attr3.x +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7 +; GCN-NEXT: exp mrt0 v0, v1, v2, v3 done +; GCN-NEXT: s_endpgm +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) + %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0) + %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0) + %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0) + %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0) + %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1) + %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0) + %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2) + %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0) + %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3) 
+ %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_f32_many_vm: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: lds_param_load v2, attr0.x +; GCN-NEXT: lds_param_load v3, attr1.x +; GCN-NEXT: lds_param_load v4, attr2.x +; GCN-NEXT: lds_param_load v5, attr3.x +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GCN-NEXT: exp mrt0 v2, v3, v4, v0 done +; GCN-NEXT: s_endpgm +main_body: + %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1 + %i = load float, float addrspace(1)* %i.ptr, align 4 + %j.ptr = getelementptr float, float addrspace(1)* %ptr, i32 2 + %j = load float, float addrspace(1)* %j.ptr, align 4 + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) + %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0) + %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0) + %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0) + %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0) + %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1) + %p1_1 = call float 
@llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0) + %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2) + %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0) + %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3) + %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_f16: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: lds_param_load v0, attr0.x +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7 +; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_add_f16_e32 v0, v3, v0 +; GCN-NEXT: ; return to shader part epilog +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) + %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %l_p0, i1 0) + %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1) + %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %h_p0, i1 1) + %res = fadd half %l_p1, %h_p1 + ret half %res +} + +declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0 +declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0 +declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0 +declare half 
@llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }