Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -1074,6 +1074,20 @@ [IntrNoMem, IntrSpeculatable]>; // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. +// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0> +def int_amdgcn_interp_p1_f16 : + GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">, + Intrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + +// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0> +def int_amdgcn_interp_p2_f16 : + GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">, + Intrinsic<[llvm_half_ty], + [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + // Pixel shaders only: whether the current pixel is live (i.e. not a helper // invocation for derivative computation). def int_amdgcn_ps_live : Intrinsic < Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -455,6 +455,9 @@ INTERP_MOV, INTERP_P1, INTERP_P2, + INTERP_P1LL_F16, + INTERP_P1LV_F16, + INTERP_P2_F16, PC_ADD_REL_OFFSET, KILL, DUMMY_CHAIN, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -845,6 +845,9 @@ case AMDGPUISD::INTERP_MOV: case AMDGPUISD::INTERP_P1: case AMDGPUISD::INTERP_P2: + case AMDGPUISD::INTERP_P1LL_F16: + case AMDGPUISD::INTERP_P1LV_F16: + case AMDGPUISD::INTERP_P2_F16: return true; } return false; @@ -4176,6 +4179,9 @@ NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) + NODE_NAME_CASE(INTERP_P1LL_F16) + NODE_NAME_CASE(INTERP_P1LV_F16) + NODE_NAME_CASE(INTERP_P2_F16) 
NODE_NAME_CASE(INTERP_P1) NODE_NAME_CASE(INTERP_P2) NODE_NAME_CASE(STORE_MSKOR) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -370,6 +370,17 @@ SDTypeProfile<1, 4, [SDTCisFP<0>]>, [SDNPInGlue]>; +def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", + SDTypeProfile<1, 7, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16", + SDTypeProfile<1, 9, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16", + SDTypeProfile<1, 8, [SDTCisFP<0>]>, + [SDNPInGlue]>; def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, [SDNPHasChain, SDNPSideEffect]>; Index: lib/Target/AMDGPU/AMDGPUSearchableTables.td =================================================================== --- lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -44,6 +44,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -4620,6 +4620,59 @@ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_interp_p1_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = M0.getValue(1); + if (getSubtarget()->getLDSBankCount() == 16) { + // 16 bank LDS + SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Glue); + SDValue Ops[] = { + Op.getOperand(1), // Src0 + 
Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + S, // Src2 - holds two f16 values selected by high + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i1) // $omod + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); + } else { + // 32 bank LDS + SDValue Ops[] = { + Op.getOperand(1), // Src0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i1), // $omod + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); + } + } + case Intrinsic::amdgcn_interp_p2_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); + SDValue Glue = SDValue(M0.getNode(), 1); + SDValue Ops[] = { + Op.getOperand(2), // Src0 + Op.getOperand(3), // Attrchan + Op.getOperand(4), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(1), // Src2 + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(5), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); + } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -219,7 +219,8 @@ // VOP3 INTERP //===----------------------------------------------------------------------===// -class VOP3Interp : VOP3_Pseudo { +class VOP3Interp pattern = []> : + VOP3_Pseudo { let AsmMatchConverter = "cvtVOP3Interp"; } @@ -428,8 +429,17 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst 
<"v_mad_i16", VOP3_Profile>; def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma>; -def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; -} +let Uses = [M0, EXEC] in { +def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>, + [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan), + (i32 imm:$attr), + (i32 imm:$src0_modifiers), + (f32 VRegSrc_32:$src2), + (i32 imm:$src2_modifiers), + (i1 imm:$high), + (i1 imm:$clamp)))]>; +} // End Uses = [M0, EXEC] +} // End renamedInGfx9 = 1 let SubtargetPredicate = isGFX9 in { def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile>; @@ -439,8 +449,24 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9 -def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; -def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; +let Uses = [M0, EXEC] in { +def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>, + [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan), + (i32 imm:$attr), + (i32 imm:$src0_modifiers), + (i1 imm:$high), + (i1 imm:$clamp), + (i1 imm:$omod)))]>; +def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>, + [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan), + (i32 imm:$attr), + (i32 imm:$src0_modifiers), + (f32 VRegSrc_32:$src2), + (i32 imm:$src2_modifiers), + (i1 imm:$high), + (i1 imm:$clamp), + (i1 imm:$omod)))]>; +} // End Uses = [M0, EXEC] } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.interp.f16.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.interp.f16.ll @@ -0,0 +1,24 
@@ +; RUN: opt -mtriple=amdgcn-- -verify-machineinstrs -analyze -divergence < %s | FileCheck %s + +; CHECK: DIVERGENT: %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 false, i32 %m0) +; CHECK: DIVERGENT: %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 false, i32 %m0) +; CHECK: DIVERGENT: %p1_1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 true, i32 %m0) +; CHECK: DIVERGENT: %p2_1 = call half @llvm.amdgcn.interp.p2.f16(float %p1_1, float %j, i32 1, i32 2, i1 true, i32 %m0) + +define amdgpu_ps half @interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +main_body: + %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0) + %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0) + %p1_1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 1, i32 %m0) + %p2_1 = call half @llvm.amdgcn.interp.p2.f16(float %p1_1, float %j, i32 1, i32 2, i1 1, i32 %m0) + %res = fadd half %p2_0, %p2_1 + ret half %res +} + +; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0) +declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0 +; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0) +declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0 +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9_32BANK %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8_32BANK %s +; RUN: llc -march=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8_16BANK %s + 
+; GFX9_32BANK-LABEL: {{^}}interp_f16: +; GFX9_32BANK: s_mov_b32 m0, s{{[0-9]+}} +; GFX9_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y{{$}} +; GFX9_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y high +; GFX9_32BANK: v_interp_p2_legacy_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}} +; GFX9_32BANK: v_interp_p2_legacy_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high + +; GFX8_32BANK-LABEL: {{^}}interp_f16: +; GFX8_32BANK: s_mov_b32 m0, s{{[0-9]+}} +; GFX8_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y{{$}} +; GFX8_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y high +; GFX8_32BANK: v_interp_p2_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}} +; GFX8_32BANK: v_interp_p2_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high + +; GFX8_16BANK-LABEL: {{^}}interp_f16: +; GFX8_16BANK: s_mov_b32 m0, s{{[0-9]+}} +; there should be only one v_interp_mov +; GFX8_16BANK: v_interp_mov_f32_e32 v{{[0-9]+}}, p0, attr2.y +; GFX8_16BANK-NOT: v_interp_mov_f32_e32 v{{[0-9]+}}, p0, attr2.y +; GFX8_16BANK: v_interp_p1lv_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}} +; GFX8_16BANK: v_interp_p1lv_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high +; GFX8_16BANK: v_interp_p2_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}} +; GFX8_16BANK: v_interp_p2_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high + +define amdgpu_ps half @interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +main_body: + %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0) + %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0) + %p1_1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 1, i32 %m0) + %p2_1 = call half @llvm.amdgcn.interp.p2.f16(float %p1_1, float %j, i32 1, i32 2, i1 1, i32 
%m0) + %res = fadd half %p2_0, %p2_1 + ret half %res +} + +; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0) +declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0 +; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0) +declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0 +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 + +attributes #0 = { nounwind readnone }