Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -206,6 +206,10 @@ [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] >; +def int_amdgcn_cvt_pkrtz : Intrinsic< + [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem] +>; + def int_amdgcn_class : Intrinsic< [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem] >; Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -318,6 +318,11 @@ CVT_F32_UBYTE1, CVT_F32_UBYTE2, CVT_F32_UBYTE3, + + // Convert two float 32 numbers into a single register holding two packed f16 + // with round to zero. + CVT_PKRTZ_F16_F32, + /// This node is for VLIW targets and it is used to represent a vector /// that is stored in consecutive registers with the same channel. 
/// For example: Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3458,6 +3458,7 @@ NODE_NAME_CASE(CVT_F32_UBYTE1) NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(CVT_PKRTZ_F16_F32) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -31,6 +31,10 @@ [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] >; +def AMDGPUFPPackOp : SDTypeProfile<1, 2, + [SDTCisFP<1>, SDTCisSameAs<1, 2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; @@ -78,6 +82,8 @@ def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; +def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; + def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; // out = max(a, b) a and b are floats, where a nan comparison fails. 
Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -88,6 +88,7 @@ SDValue Op0, SDValue Op1) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -188,6 +188,7 @@ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -2013,6 +2014,23 @@ Results.push_back(Res); return; } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_cvt_pkrtz: { + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + SDLoc SL(N); + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, + Src0, Src1); + + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); + return; + } + default: + break; + } + } default: break; } @@ -2690,10 +2708,6 @@ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case AMDGPUIntrinsic::SI_packf16: - if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) - return DAG.getUNDEF(MVT::i32); - return Op; case Intrinsic::amdgcn_interp_mov: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); SDValue Glue = M0.getValue(1); @@ -2810,6 +2824,18 
@@ Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_sffbh: return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_cvt_pkrtz: { + // FIXME: Stop adding cast if v2f16 legal. + EVT VT = Op.getValueType(); + SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32, + Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(ISD::BITCAST, DL, VT, Node); + } + case AMDGPUIntrinsic::SI_packf16: { // Legacy name + EVT VT = Op.getValueType(); + return DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -4153,6 +4179,15 @@ return SDValue(); } +SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + if (Src0.isUndef() && Src1.isUndef()) + return DCI.DAG.getUNDEF(N->getValueType(0)); + return SDValue(); +} + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -4421,6 +4456,8 @@ return performCvtF32UByteNCombine(N, DCI); case AMDGPUISD::FMED3: return performFMed3Combine(N, DCI); + case AMDGPUISD::CVT_PKRTZ_F16_F32: + return performCvtPkRTZCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -1152,6 +1152,7 @@ def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; Index: lib/Target/AMDGPU/VOP2Instructions.td 
=================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -354,7 +354,7 @@ defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>; defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>; defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>; defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>; Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3206,6 +3206,31 @@ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); } + case Intrinsic::amdgcn_cvt_pkrtz: { + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { + if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { + const fltSemantics &HalfSem + = II->getType()->getScalarType()->getFltSemantics(); + bool LosesInfo; + APFloat Val0 = C0->getValueAPF(); + APFloat Val1 = C1->getValueAPF(); + Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + + Constant *Folded = ConstantVector::get({ + ConstantFP::get(II->getContext(), Val0), + ConstantFP::get(II->getContext(), Val1) }); + return replaceInstUsesWith(*II, Folded); + } + } + + if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + break; + } case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the 
restore. This can // happen when variable allocas are DCE'd. Index: test/CodeGen/AMDGPU/commute-shifts.ll =================================================================== --- test/CodeGen/AMDGPU/commute-shifts.ll +++ test/CodeGen/AMDGPU/commute-shifts.ll @@ -15,13 +15,13 @@ %tmp5 = and i32 %tmp2, %tmp4 %tmp6 = icmp eq i32 %tmp5, 0 %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1 - %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7) - %tmp9 = bitcast i32 %tmp8 to float + %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) + %tmp9 = bitcast <2 x half> %tmp8 to float ret float %tmp9 } declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare i32 @llvm.SI.packf16(float, float) #1 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -0,0 +1,163 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: s_load_dword [[Y:s[0-9]+]] +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]] +define void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { + %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) + store <2 x half> %result, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}s_cvt_pkrtz_samereg_v2f16_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]] +define void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { + %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) + store <2 x half> %result, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pkrtz_undef_undef: +; GCN-NEXT: ; BB#0 +; GCN-NEXT: s_endpgm +define void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 { + %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) + store <2 x half> %result, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] +; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]] +define void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0 +define void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep 
= getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] +; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]] +define void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]] +define void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %cvt = call <2 x half> 
@llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]] +define void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.b = fsub float -0.0, %b + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]] +define void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x half> 
@llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]] +define void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fabs.a = call float @llvm.fabs.f32(float %a) + %neg.fabs.a = fsub float -0.0, %fabs.a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare float @llvm.fabs.f32(float) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -204,11 +204,9 @@ %tmp = call float @llvm.fabs.f32(float %p2.i) %tmp34 = call float @llvm.fabs.f32(float %p2.i12) %tmp35 = call float @llvm.fabs.f32(float %p2.i6) - %tmp36 = call i32 @llvm.SI.packf16(float %tmp, float %tmp34) - %tmp37 = bitcast i32 %tmp36 to <2 x half> - %tmp38 = call i32 @llvm.SI.packf16(float %tmp35, float 1.000000e+00) - 
%tmp39 = bitcast i32 %tmp38 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 true) #0 + %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp, float %tmp34) + %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp35, float 1.000000e+00) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 true) #0 ret void } @@ -218,7 +216,7 @@ declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 -declare i32 @llvm.SI.packf16(float, float) #1 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -164,11 +164,9 @@ %tmp110 = fmul float %tmp109, %tmp106 %tmp111 = fsub float -0.000000e+00, %tmp105 %tmp112 = fmul float %tmp111, %tmp106 - %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110) - %tmp114 = bitcast i32 %tmp113 to <2 x half> - %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00) - %tmp116 = bitcast i32 %tmp115 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp114, <2 x half> %tmp116, i1 true, i1 true) #0 + %tmp113 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp108, float %tmp110) + %tmp115 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp112, float 1.000000e+00) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp113, <2 x half> %tmp115, i1 true, i1 true) #0 ret void } @@ -388,9 +386,8 @@ %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 undef, i32 undef>, <8 x 
i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp10 = extractelement <4 x float> %tmp9, i32 0 - %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10) - %tmp13 = bitcast i32 %tmp12 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0 + %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } @@ -404,9 +401,8 @@ %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0 %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 undef, i32 undef>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp10 = extractelement <4 x float> %tmp9, i32 0 - %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef) - %tmp13 = bitcast i32 %tmp12 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0 + %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } @@ -419,8 +415,8 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 -declare i32 @llvm.SI.packf16(float, float) #1 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 Index: test/CodeGen/AMDGPU/si-scheduler.ll =================================================================== --- test/CodeGen/AMDGPU/si-scheduler.ll +++ 
test/CodeGen/AMDGPU/si-scheduler.ll @@ -45,11 +45,9 @@ %tmp33 = extractelement <4 x float> %tmp31, i32 1 %tmp34 = extractelement <4 x float> %tmp31, i32 2 %tmp35 = extractelement <4 x float> %tmp31, i32 3 - %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33) - %tmp37 = bitcast i32 %tmp36 to <2 x half> - %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35) - %tmp39 = bitcast i32 %tmp38 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 false) #0 + %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp32, float %tmp33) + %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp34, float %tmp35) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 false) #0 ret void } @@ -58,7 +56,7 @@ declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare i32 @llvm.SI.packf16(float, float) #1 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll =================================================================== --- test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -727,11 +727,9 @@ %tmp578 = fadd float %tmp577, %tmp554 %tmp579 = fmul float %tmp574, %tmp45 %tmp580 = fadd float %tmp579, %tmp556 - %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578) - %tmp582 = bitcast i32 %tmp581 to <2 x half> - %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282) - %tmp584 = bitcast i32 %tmp583 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp582, <2 x half> %tmp584, i1 true, i1 true) #0 + %tmp581 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp576, float %tmp578) + %tmp583 = call <2 
x half> @llvm.amdgcn.cvt.pkrtz(float %tmp580, float %tmp282) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp581, <2 x half> %tmp583, i1 true, i1 true) #0 ret void ENDIF66: ; preds = %LOOP65 @@ -1806,11 +1804,9 @@ %tmp774 = fadd float %tmp773, %tmp52 %max.0.i1 = call float @llvm.maxnum.f32(float %tmp774, float 0.000000e+00) %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) - %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770) - %tmp777 = bitcast i32 %tmp776 to <2 x half> - %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %clamp.i2) - %tmp779 = bitcast i32 %tmp778 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp777, <2 x half> %tmp779, i1 true, i1 true) #0 + %tmp776 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp768, float %tmp770) + %tmp778 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp772, float %clamp.i2) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp776, <2 x half> %tmp778, i1 true, i1 true) #0 ret void ELSE214: ; preds = %ELSE211 @@ -1842,13 +1838,13 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare i32 @llvm.SI.packf16(float, float) #1 attributes #0 = { nounwind } attributes #1 
= { nounwind readnone } Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -105,9 +105,8 @@ endif: ; preds = %else, %if %export = phi float [ %lds_data, %if ], [ %interp, %else ] - %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export) - %tmp5 = bitcast i32 %tmp4 to float - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp5, float %tmp5, float %tmp5, float %tmp5, i1 true, i1 true) #0 + %tmp4 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %export, float %export) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp4, <2 x half> %tmp4, i1 true, i1 true) #0 ret void } @@ -207,7 +206,8 @@ declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare i32 @llvm.SI.packf16(float, float) #1 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/split-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/split-smrd.ll +++ test/CodeGen/AMDGPU/split-smrd.ll @@ -23,17 +23,16 @@ %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 undef, i32 undef>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp10 = extractelement <4 x float> %tmp9, i32 0 - %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef) - %tmp13 = bitcast i32 %tmp12 to <2 x half> - call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0 + %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 
15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare i32 @llvm.SI.packf16(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/Transforms/InstCombine/amdgcn-intrinsics.ll =================================================================== --- test/Transforms/InstCombine/amdgcn-intrinsics.ll +++ test/Transforms/InstCombine/amdgcn-intrinsics.ll @@ -633,3 +633,73 @@ %cos = call float @llvm.amdgcn.cos.f32(float %x.fabs.fneg) ret float %cos } + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pkrtz +; -------------------------------------------------------------------- + +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) nounwind readnone + +; CHECK-LABEL: @vars_lhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) +define <2 x half> @vars_lhs_cvt_pkrtz(float %x, float %y) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_lhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %y) +define <2 x half> @constant_lhs_cvt_pkrtz(float %y) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %y) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_rhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.000000e+00) +define <2 x half> @constant_rhs_cvt_pkrtz(float %x) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.0) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @undef_lhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> 
@llvm.amdgcn.cvt.pkrtz(float undef, float %y) +define <2 x half> @undef_lhs_cvt_pkrtz(float %y) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef) +define <2 x half> @undef_rhs_cvt_pkrtz(float %x) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @undef_cvt_pkrtz( +; CHECK: ret <2 x half> undef +define <2 x half> @undef_cvt_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_splat0_cvt_pkrtz( +; CHECK: ret <2 x half> zeroinitializer +define <2 x half> @constant_splat0_cvt_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float 0.0) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_cvt_pkrtz( +; CHECK: ret <2 x half> <half 0xH4000, half 0xH4400> +define <2 x half> @constant_cvt_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 2.0, float 4.0) + ret <2 x half> %cvt +} + +; Test constant values where rtz changes result +; CHECK-LABEL: @constant_rtz_pkrtz( +; CHECK: ret <2 x half> <half 0xH7BFF, half 0xH7BFF> +define <2 x half> @constant_rtz_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0) + ret <2 x half> %cvt +}