Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -170,7 +170,7 @@ [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_amdgcn_ldexp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty], [IntrNoMem] >; def int_amdgcn_frexp_mant : Intrinsic< @@ -178,7 +178,7 @@ >; def int_amdgcn_frexp_exp : Intrinsic< - [llvm_i32_ty], [llvm_anyfloat_ty], [IntrNoMem] + [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem] >; // v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -190,6 +190,12 @@ // Subtarget Features (options and debugging) //===------------------------------------------------------------===// +def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", + "FP16Denormals", + "true", + "Enable half precision denormal handling" +>; + // Some instructions do not support denormals despite this flag. Using // fp32 denormals also causes instructions to run at the double // precision rate for the device. Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,6 +42,7 @@ field bits<32> Inst = 0xffffffff; } +def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">; def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -72,6 +72,7 @@ bool HalfRate64Ops; // Dynamially set bits that enable features. + bool FP16Denormals; bool FP32Denormals; bool FP64Denormals; bool FPExceptions; @@ -262,6 +263,10 @@ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + bool hasFP16Denormals() const { + return FP16Denormals; + } + bool hasFP32Denormals() const { return FP32Denormals; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -56,6 +56,7 @@ // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP16Denormals = false; FP32Denormals = false; FP64Denormals = false; } @@ -81,6 +82,7 @@ FastFMAF32(false), HalfRate64Ops(false), + FP16Denormals(false), FP32Denormals(false), FP64Denormals(false), FPExceptions(false), Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -46,6 +46,12 @@ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + /// \brief Custom lowering for ISD::SINT_TO_FP, ISD::UINT_TO_FP. + SDValue LowerIntToFp(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Custom lowering for ISD::ConstantFP. 
+  SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
   SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -80,8 +80,11 @@
   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

   // TODO: Subtarget feature for i16
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
     addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+    addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+    addRegisterClass(MVT::f16, &AMDGPU::VGPR_32RegClass);
+  }

   computeRegisterProperties(STI.getRegisterInfo());

@@ -269,15 +272,35 @@
     setTruncStoreAction(MVT::i64, MVT::i16, Expand);

-    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
-    AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
-    AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32);
     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
+
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
+
+    // F16 - Constant Actions.
+    setOperationAction(ISD::ConstantFP, MVT::f16, Custom);
+
+    // F16 - Load/Store Actions.
+    setOperationAction(ISD::LOAD, MVT::f16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
+    setOperationAction(ISD::STORE, MVT::f16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
+
+    // F16 - VOP1 Actions.
+    setOperationAction(ISD::FCOS, MVT::f16, Promote);
+    setOperationAction(ISD::FSIN, MVT::f16, Promote);
+
+    // F16 - VOP2 Actions.
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+
+    // F16 - VOP3 Actions.
+    setOperationAction(ISD::FMA, MVT::f16, Legal);
+    if (!Subtarget->hasFP16Denormals())
+      setOperationAction(ISD::FMAD, MVT::f16, Legal);
   }

   setTargetDAGCombine(ISD::FADD);
@@ -1800,6 +1823,13 @@
   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
   case ISD::TRAP: return lowerTRAP(Op, DAG);
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerIntToFp(Op, DAG);
+
+  case ISD::ConstantFP:
+    return LowerConstantFP(Op, DAG);
   }
   return SDValue();
 }
@@ -1994,6 +2024,29 @@
   return Chain;
 }

+SDValue SITargetLowering::LowerIntToFp(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType() == MVT::i64)
+    return Op.getOpcode() == ISD::SINT_TO_FP ?
+      AMDGPUTargetLowering::LowerSINT_TO_FP(Op, DAG) :
+      AMDGPUTargetLowering::LowerUINT_TO_FP(Op, DAG);
+
+  EVT DestVT = Op.getValueType();
+  if (DestVT == MVT::f16)
+    return Op;
+
+  SDValue SExtOrZext = Op.getOpcode() == ISD::SINT_TO_FP ?
+    DAG.getSExtOrTrunc(Op.getOperand(0), SDLoc(Op), MVT::i32) :
+    DAG.getZExtOrTrunc(Op.getOperand(0), SDLoc(Op), MVT::i32);
+  return DAG.getNode(Op.getOpcode(), SDLoc(Op), DestVT, SExtOrZext);
+}
+
+SDValue SITargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG) const {
+  if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op))
+    return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(),
+                           SDLoc(Op), MVT::i32);
+  return SDValue();
+}
+
 SDValue SITargetLowering::getSegmentAperture(unsigned AS, SelectionDAG &DAG) const {
   SDLoc SL;
@@ -3546,7 +3599,7 @@
   SDValue RHS = N->getOperand(1);
   EVT VT = LHS.getValueType();

-  if (VT != MVT::f32 && VT != MVT::f64)
+  if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
     return SDValue();

   // Match isinf pattern
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1412,7 +1412,9 @@
     return true;
   }

-  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
+  if (Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
+      Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
+    bool IsF16 = Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64;
     // Don't fold if we are using source modifiers. The new VOP2 instructions
     // don't have them.
     if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
@@ -1433,7 +1435,7 @@
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

-    // Multiplied part is the constant: Use v_madmk_f32
+    // Multiplied part is the constant: Use v_madmk_{f16|f32}
     // We should only expect these to be on src0 due to canonicalizations.
     if (Src0->isReg() && Src0->getReg() == Reg) {
       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
@@ -1461,7 +1463,7 @@
       Src0->setSubReg(Src1SubReg);
       Src0->setIsKill(Src1->isKill());

-      if (Opc == AMDGPU::V_MAC_F32_e64) {
+      if (Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_MAC_F32_e64) {
         UseMI.untieRegOperand(
           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
       }
@@ -1469,7 +1471,7 @@
       Src1->ChangeToImmediate(Imm);

       removeModOperands(UseMI);
-      UseMI.setDesc(get(AMDGPU::V_MADMK_F32));
+      UseMI.setDesc(get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32));

       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
       if (DeleteDef)
@@ -1478,7 +1480,7 @@
       return true;
     }

-    // Added part is the constant: Use v_madak_f32
+    // Added part is the constant: Use v_madak_{f16|f32}
     if (Src2->isReg() && Src2->getReg() == Reg) {
       // Not allowed to use constant bus for another operand.
       // We can however allow an inline immediate as src0.
@@ -1500,7 +1502,7 @@
       UseMI.RemoveOperand(
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

-      if (Opc == AMDGPU::V_MAC_F32_e64) {
+      if (Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_MAC_F32_e64) {
         UseMI.untieRegOperand(
           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
       }
@@ -1510,7 +1512,7 @@
       // These come before src2.
       removeModOperands(UseMI);

-      UseMI.setDesc(get(AMDGPU::V_MADAK_F32));
+      UseMI.setDesc(get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32));

       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
       if (DeleteDef)
@@ -1619,12 +1621,17 @@
 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                  MachineInstr &MI,
                                                  LiveVariables *LV) const {
+  bool IsF16 = false;
   switch (MI.getOpcode()) {
   default:
     return nullptr;
+  case AMDGPU::V_MAC_F16_e64:
+    IsF16 = true;
   case AMDGPU::V_MAC_F32_e64:
     break;
+  case AMDGPU::V_MAC_F16_e32:
+    IsF16 = true;
   case AMDGPU::V_MAC_F32_e32: {
     const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
     if (Src0->isImm() && !isInlineConstant(*Src0, 4))
@@ -1638,7 +1645,8 @@
   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

-  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32))
+  return BuildMI(*MBB, MI, MI.getDebugLoc(),
+                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
     .addOperand(*Dst)
     .addImm(0) // Src0 mods
     .addOperand(*Src0)
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -933,14 +933,12 @@
   let HasExt = 0;
 }

-// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order
-// for the instruction patterns to work.
 def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
-def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>;
-def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>;
+def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
+def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;

 def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
-def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>;
+def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;

 def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
 def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -414,6 +414,16 @@

 } // End Predicates = [UnsafeFPMath]

+def : Pat <
+  (f32 (fpextend f16:$src)),
+  (V_CVT_F32_F16_e32 $src)
+>;
+
+def : Pat <
+  (f16 (fpround f32:$src)),
+  (V_CVT_F16_F32_e32 $src)
+>;
+
 //===----------------------------------------------------------------------===//
 // VOP2 Patterns
 //===----------------------------------------------------------------------===//
@@ -428,11 +438,20 @@
   (V_CNDMASK_B32_e64 $src2, $src1, $src0)
 >;

+// Pattern for V_MAC_F16
+def : Pat <
+  (f16 (fmad (VOP3NoMods0 f16:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+             (VOP3NoMods f16:$src1, i32:$src1_modifiers),
+             (VOP3NoMods f16:$src2, i32:$src2_modifiers))),
+  (V_MAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+                 $src2_modifiers, $src2, $clamp, $omod)
+>;
+
 // Pattern for V_MAC_F32
 def : Pat <
-  (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
-        (VOP3NoMods f32:$src1, i32:$src1_modifiers),
-        (VOP3NoMods f32:$src2, i32:$src2_modifiers)),
+  (f32 (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+             (VOP3NoMods f32:$src1, i32:$src1_modifiers),
+             (VOP3NoMods f32:$src2, i32:$src2_modifiers))),
   (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
                  $src2_modifiers, $src2, $clamp, $omod)
 >;
@@ -507,6 +526,12 @@

 // FIXME: Why do only some of these type combinations for SReg and
 // VReg?
+// 16-bit bitcast
+def : BitConvert <i16, f16, VGPR_32>;
+def : BitConvert <f16, i16, VGPR_32>;
+def : BitConvert <i16, f16, SReg_32>;
+def : BitConvert <f16, i16, SReg_32>;
+
 // 32-bit bitcast
 def : BitConvert <i32, f32, VGPR_32>;
 def : BitConvert <f32, i32, VGPR_32>;
@@ -832,6 +857,11 @@
       (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
 >;

+def : Pat <
+  (f64 (fpextend f16:$src)),
+  (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
+>;
+
 class FPToI1Pat : Pat <
   (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
   (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
Index: lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.td
+++ lib/Target/AMDGPU/SIRegisterInfo.td
@@ -123,7 +123,7 @@
 // TODO: Do we need to set DwarfRegAlias on register tuples?

 // SGPR 32-bit registers
-def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
                             (add (sequence "SGPR%u", 0, 103))> {
   let AllocationPriority = 1;
 }
@@ -190,8 +190,8 @@
                      (add (decimate (shl TTMP_32, 3), 4))]>;

 // VGPR 32-bit registers
-// i16 only on VI+
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
+// i16/f16 only on VI+
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
                             (add (sequence "VGPR%u", 0, 255))> {
   let AllocationPriority = 1;
   let Size = 32;
 }
@@ -259,7 +259,7 @@
 }

 // Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
   (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
   let AllocationPriority = 1;
 }
@@ -347,7 +347,8 @@
   let Size = 32;
 }

-def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> {
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+                          (add VGPR_32, SReg_32)> {
   let isAllocatable = 0;
 }
Index: lib/Target/AMDGPU/SISchedule.td
===================================================================
--- lib/Target/AMDGPU/SISchedule.td
+++ lib/Target/AMDGPU/SISchedule.td
@@ -26,6 +26,7 @@
 def WriteBarrier : SchedWrite;

 // Vector ALU instructions
+def Write16Bit : SchedWrite;
 def Write32Bit : SchedWrite;
 def WriteQuarterRate32 : SchedWrite;
 def WriteFullOrQuarterRate32 : SchedWrite;
@@ -101,6 +102,7 @@
   def : HWWriteRes;
   def : HWWriteRes; // XXX: Guessed ???

+  def : HWVALUWriteRes; // XXX: Look this up.
def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -109,6 +111,7 @@ def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>; def WriteCopy : SchedWriteVariant<[ + SchedVar, SchedVar, SchedVar, SchedVar]>; Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -90,6 +90,7 @@ switch (MI.getOpcode()) { default: return false; + case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_MAC_F32_e64: if (!isVGPR(Src2, TRI, MRI) || TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) Index: lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP1Instructions.td +++ lib/Target/AMDGPU/VOP1Instructions.td @@ -280,24 +280,24 @@ let SubtargetPredicate = isVI in { -defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16>; -defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16>; -defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16>; -defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16>; -defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16>; -defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16>; -defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16>; -defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16>; -defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16>; -defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16>; -defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16>; -defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16>; -defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16>; -defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16>; -defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16>; -defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16>; -defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16>; -defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>; +defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>; +defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>; +defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; +defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; +defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; +defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>; +defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; +defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>; +defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; +defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>; +defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; +defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; +defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; +defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; +defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; +defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; +defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ 
lib/Target/AMDGPU/VOP2Instructions.td @@ -322,25 +322,29 @@ defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_B16 : VOP2Inst <"v_ashrrev_b16", VOP_I16_I16_I16>; -defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I16>; +defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I16, AMDGPUldexp>; let isCommutable = 1 in { -defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16>; -defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16>; +defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; +defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; -defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16>; -defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_F16_F16_F16>; +defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK>; defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16>; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; -defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16>; -defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16>; +defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>; +defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>; defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; + +let Constraints = "$vdst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1 in { +defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC>; +} } // End isCommutable = 1 } // End SubtargetPredicate = isVI Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -215,10 +215,14 @@ let SubtargetPredicate = isVI in { let isCommutable = 1 in { - def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile>; - def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; - def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; -} + +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma>; +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; +def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; +def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; + +} // End isCommutable = 1 + } // End SubtargetPredicate = isVI @@ -414,6 +418,8 @@ defm V_MAD_U16 : VOP3_Real_vi <0x1eb>; defm V_MAD_I16 : VOP3_Real_vi <0x1ec>; +defm V_FMA_F16 : VOP3_Real_vi <0x1ee>; + defm V_ADD_F64 : VOP3_Real_vi <0x280>; defm V_MUL_F64 : VOP3_Real_vi <0x281>; defm V_MIN_F64 : VOP3_Real_vi <0x282>; Index: lib/Target/AMDGPU/VOPCInstructions.td =================================================================== --- lib/Target/AMDGPU/VOPCInstructions.td +++ lib/Target/AMDGPU/VOPCInstructions.td @@ -144,11 +144,15 @@ } } +def VOPC_I1_F16_F16 : VOPC_Profile<[Write16Bit], f16>; def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>; def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>; def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>; def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>; +multiclass VOPC_F16 : + VOPC_Pseudos ; + multiclass VOPC_F32 : VOPC_Pseudos ; @@ -161,6 +165,9 
@@ multiclass VOPC_I64 : VOPC_Pseudos ; +multiclass VOPCX_F16 : + VOPC_Pseudos ; + multiclass VOPCX_F32 : VOPC_Pseudos ; @@ -318,6 +325,44 @@ } // End SubtargetPredicate = isSICI +let SubtargetPredicate = isVI in { + +defm V_CMP_F_F16 : VOPC_F16 <"v_cmp_f_f16">; +defm V_CMP_LT_F16 : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">; +defm V_CMP_EQ_F16 : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>; +defm V_CMP_LE_F16 : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">; +defm V_CMP_GT_F16 : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>; +defm V_CMP_LG_F16 : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>; +defm V_CMP_GE_F16 : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>; +defm V_CMP_O_F16 : VOPC_F16 <"v_cmp_o_f16", COND_O>; +defm V_CMP_U_F16 : VOPC_F16 <"v_cmp_u_f16", COND_UO>; +defm V_CMP_NGE_F16 : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">; +defm V_CMP_NLG_F16 : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>; +defm V_CMP_NGT_F16 : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">; +defm V_CMP_NLE_F16 : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>; +defm V_CMP_NEQ_F16 : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>; +defm V_CMP_NLT_F16 : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>; +defm V_CMP_TRU_F16 : VOPC_F16 <"v_cmp_tru_f16">; + +defm V_CMPX_F_F16 : VOPCX_F16 <"v_cmpx_f_f16">; +defm V_CMPX_LT_F16 : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">; +defm V_CMPX_EQ_F16 : VOPCX_F16 <"v_cmpx_eq_f16">; +defm V_CMPX_LE_F16 : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">; +defm V_CMPX_GT_F16 : VOPCX_F16 <"v_cmpx_gt_f16">; +defm V_CMPX_LG_F16 : VOPCX_F16 <"v_cmpx_lg_f16">; +defm V_CMPX_GE_F16 : VOPCX_F16 <"v_cmpx_ge_f16">; +defm V_CMPX_O_F16 : VOPCX_F16 <"v_cmpx_o_f16">; +defm V_CMPX_U_F16 : VOPCX_F16 <"v_cmpx_u_f16">; +defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16">; +defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16">; +defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">; +defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">; + +} // End SubtargetPredicate = isVI + defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">; defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">; @@ -429,9 +474,16 @@ } } +def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write16Bit], f16>; def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>; def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>; +multiclass VOPC_CLASS_F16 : + VOPC_Class_Pseudos ; + +multiclass VOPCX_CLASS_F16 : + VOPC_Class_Pseudos ; + multiclass VOPC_CLASS_F32 : VOPC_Class_Pseudos ; @@ -449,6 +501,9 @@ defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">; defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">; +defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">; +defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; + //===----------------------------------------------------------------------===// // V_ICMPIntrinsic Pattern. 
//===----------------------------------------------------------------------===// @@ -810,147 +865,183 @@ } } -defm V_CMP_F_F32 : VOPC_Real_vi <0x40>; -defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>; -defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>; -defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>; -defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>; -defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>; -defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>; -defm V_CMP_O_F32 : VOPC_Real_vi <0x47>; -defm V_CMP_U_F32 : VOPC_Real_vi <0x48>; -defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>; -defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>; -defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>; -defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>; -defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>; -defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>; -defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>; - -defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>; -defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>; -defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>; -defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>; -defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>; -defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>; -defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>; -defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>; -defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>; -defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>; -defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>; -defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>; -defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>; -defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>; -defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>; -defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>; - -defm V_CMP_F_F64 : VOPC_Real_vi <0x60>; -defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>; -defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>; -defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>; -defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>; -defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>; -defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>; -defm V_CMP_O_F64 : VOPC_Real_vi <0x67>; -defm V_CMP_U_F64 : VOPC_Real_vi <0x68>; -defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>; -defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>; -defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>; -defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>; -defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>; -defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>; -defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>; - -defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>; -defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>; -defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>; -defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>; -defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>; -defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>; -defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>; -defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>; -defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>; -defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>; -defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>; -defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>; -defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>; -defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>; -defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>; -defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>; - -defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>; -defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>; -defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>; -defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>; -defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>; -defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>; -defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>; -defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>; - -defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>; -defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>; -defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>; -defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>; -defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>; -defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>; -defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>; -defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>; - -defm V_CMP_F_I64 : 
VOPC_Real_vi <0xe0>; -defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>; -defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>; -defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>; -defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>; -defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>; -defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>; -defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>; - -defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>; -defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>; -defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>; -defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>; -defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>; -defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>; -defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>; -defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>; - -defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>; -defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>; -defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>; -defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>; -defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>; -defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>; -defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>; -defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>; - -defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>; -defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>; -defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>; -defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>; -defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>; -defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>; -defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>; -defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>; - -defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>; -defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>; -defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>; -defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>; -defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>; -defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>; -defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>; -defm V_CMP_T_U64 : VOPC_Real_vi <0xef>; - -defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>; -defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>; -defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>; -defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>; -defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>; -defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>; -defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>; -defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>; - defm V_CMP_CLASS_F32 : VOPC_Real_vi <0x10>; defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>; defm V_CMP_CLASS_F64 : VOPC_Real_vi <0x12>; defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>; +defm V_CMP_CLASS_F16 : VOPC_Real_vi <0x014>; +defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x015>; + +defm V_CMP_F_F16 : VOPC_Real_vi <0x020>; +defm V_CMP_LT_F16 : VOPC_Real_vi <0x021>; +defm V_CMP_EQ_F16 : VOPC_Real_vi <0x022>; +defm V_CMP_LE_F16 : VOPC_Real_vi <0x023>; +defm V_CMP_GT_F16 : VOPC_Real_vi <0x024>; +defm V_CMP_LG_F16 : VOPC_Real_vi <0x025>; +defm V_CMP_GE_F16 : VOPC_Real_vi <0x026>; +defm V_CMP_O_F16 : VOPC_Real_vi <0x027>; +defm V_CMP_U_F16 : VOPC_Real_vi <0x028>; +defm V_CMP_NGE_F16 : VOPC_Real_vi <0x029>; +defm V_CMP_NLG_F16 : VOPC_Real_vi <0x02a>; +defm V_CMP_NGT_F16 : VOPC_Real_vi <0x02b>; +defm V_CMP_NLE_F16 : VOPC_Real_vi <0x02c>; +defm V_CMP_NEQ_F16 : VOPC_Real_vi <0x02d>; +defm V_CMP_NLT_F16 : VOPC_Real_vi <0x02e>; +defm V_CMP_TRU_F16 : VOPC_Real_vi <0x02f>; + +defm V_CMPX_F_F16 : VOPC_Real_vi <0x030>; +defm V_CMPX_LT_F16 : VOPC_Real_vi <0x031>; +defm V_CMPX_EQ_F16 : VOPC_Real_vi <0x032>; +defm V_CMPX_LE_F16 : VOPC_Real_vi <0x033>; +defm V_CMPX_GT_F16 : VOPC_Real_vi <0x034>; +defm V_CMPX_LG_F16 : VOPC_Real_vi <0x035>; +defm V_CMPX_GE_F16 : VOPC_Real_vi <0x036>; +defm V_CMPX_O_F16 : VOPC_Real_vi <0x037>; +defm V_CMPX_U_F16 : VOPC_Real_vi <0x038>; +defm V_CMPX_NGE_F16 : VOPC_Real_vi <0x039>; +defm V_CMPX_NLG_F16 : VOPC_Real_vi <0x03a>; +defm V_CMPX_NGT_F16 : VOPC_Real_vi <0x03b>; +defm V_CMPX_NLE_F16 : VOPC_Real_vi 
<0x03c>; +defm V_CMPX_NEQ_F16 : VOPC_Real_vi <0x03d>; +defm V_CMPX_NLT_F16 : VOPC_Real_vi <0x03e>; +defm V_CMPX_TRU_F16 : VOPC_Real_vi <0x03f>; + +defm V_CMP_F_F32 : VOPC_Real_vi <0x40>; +defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>; +defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>; +defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>; +defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>; +defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>; +defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>; +defm V_CMP_O_F32 : VOPC_Real_vi <0x47>; +defm V_CMP_U_F32 : VOPC_Real_vi <0x48>; +defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>; +defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>; +defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>; +defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>; +defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>; +defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>; +defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>; + +defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>; +defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>; +defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>; +defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>; +defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>; +defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>; +defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>; +defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>; +defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>; +defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>; +defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>; +defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>; +defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>; +defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>; +defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>; +defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>; + +defm V_CMP_F_F64 : VOPC_Real_vi <0x60>; +defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>; +defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>; +defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>; +defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>; +defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>; +defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>; +defm V_CMP_O_F64 : VOPC_Real_vi <0x67>; +defm V_CMP_U_F64 : VOPC_Real_vi <0x68>; +defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>; +defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>; +defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>; +defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>; +defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>; +defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>; +defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>; + +defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>; +defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>; +defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>; +defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>; +defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>; +defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>; +defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>; +defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>; +defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>; +defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>; +defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>; +defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>; +defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>; +defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>; +defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>; +defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>; + +defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>; +defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>; +defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>; +defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>; +defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>; +defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>; +defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>; +defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>; + +defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>; +defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>; +defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>; +defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>; +defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>; +defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>; +defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>; +defm V_CMPX_T_I32 : 
VOPC_Real_vi <0xd7>; + +defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>; +defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>; +defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>; +defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>; +defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>; +defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>; +defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>; +defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>; + +defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>; +defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>; +defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>; +defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>; +defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>; +defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>; +defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>; +defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>; + +defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>; +defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>; +defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>; +defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>; +defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>; +defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>; +defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>; +defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>; + +defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>; +defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>; +defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>; +defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>; +defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>; +defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>; +defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>; +defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>; + +defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>; +defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>; +defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>; +defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>; +defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>; +defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>; +defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>; +defm V_CMP_T_U64 : VOPC_Real_vi <0xef>; + +defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>; +defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>; +defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>; +defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>; +defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>; +defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>; +defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>; +defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>; Index: test/CodeGen/AMDGPU/amdgcn-vop1-f16-vi.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgcn-vop1-f16-vi.ll @@ -0,0 +1,202 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; Tests: +; test_simple_vt_cos_{0, 1, 2} ; v_cos_f16 +; test_simple_vt_fract_{0, 1, 2} ; v_fract_f16 +; test_simple_vt_frexp_exp_{0, 1, 2} ; v_frexp_exp_i16_f16 +; test_simple_vt_frexp_mant_{0, 1, 2} ; v_frexp_mant_f16 +; test_simple_vt_rcp_{0, 1, 2} ; v_rcp_f16 +; test_simple_vt_rsq_{0, 1, 2} ; v_rsq_f16 +; test_simple_vt_sin_{0, 1, 2} ; v_sin_f16 + +declare i32 @llvm.amdgcn.workitem.id.x() + +declare half @llvm.amdgcn.cos.f16(half %a) +declare half @llvm.amdgcn.fract.f16(half %a) +declare i16 @llvm.amdgcn.frexp.exp.f16(half %a) +declare half @llvm.amdgcn.frexp.mant.f16(half %a) +declare half @llvm.amdgcn.rcp.f16(half %a) +declare half @llvm.amdgcn.rsq.f16(half %a) +declare half @llvm.amdgcn.sin.f16(half %a) + +; GCN-LABEL: {{^}}test_simple_vt_cos_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_cos_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.cos.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cos_2 +; 
GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_cos_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.amdgcn.cos.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_fract_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_fract_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.fract.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_fract_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; VI: v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_fract_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.amdgcn.fract.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_frexp_exp_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_I16]] +define void @test_simple_vt_frexp_exp_1(i16 addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call i16 @llvm.amdgcn.frexp.exp.f16(half %a.val) + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_frexp_exp_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_I16]] +define void @test_simple_vt_frexp_exp_2(i16 addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call i16 @llvm.amdgcn.frexp.exp.f16(half %a.val) + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_frexp_mant_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_frexp_mant_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.frexp.mant.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_frexp_mant_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; VI: v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_frexp_mant_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.amdgcn.frexp.mant.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: 
{{^}}test_simple_vt_rcp_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_rcp_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.rcp.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_rcp_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_rcp_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.amdgcn.rcp.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_rsq_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_rsq_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.rsq.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_rsq_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_rsq_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.amdgcn.rsq.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_sin_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_sin_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.sin.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_sin_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; VI: v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_sin_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.amdgcn.sin.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/amdgcn-vop1-f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgcn-vop1-f16.ll @@ -0,0 +1,495 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; Tests: +; test_simple_vt_ceil_{0, 1, 2} ; v_ceil_f16 +; test_simple_vt_cos_{0, 1, 2} ; v_cos_f32 +; test_simple_vt_cvt_float_to_half_{0, 1, 2} ; v_cvt_f16_f32 +; test_simple_vt_cvt_signed_short_to_half_{0, 1, 2} ; v_cvt_f16_i16 +; 
test_simple_vt_cvt_unsigned_short_to_half_{0, 1, 2} ; v_cvt_f16_u16 +; test_simple_vt_cvt_half_to_float_{0, 1, 2} ; v_cvt_f32_f16 +; test_simple_vt_cvt_half_to_signed_short_{0, 1, 2} ; v_cvt_i16_f16 +; test_simple_vt_cvt_half_to_unsigned_short_{0, 1, 2} ; v_cvt_u16_f16 +; test_simple_vt_exp_{0, 1, 2} ; v_exp_f16 +; test_simple_vt_floor_{0, 1, 2} ; v_floor_f16 +; test_simple_vt_log_{0, 1, 2} ; v_log_f16 +; test_simple_vt_rndne_{0, 1, 2} ; v_rndne_f16 +; test_simple_vt_sin_{0, 1, 2} ; v_sin_f32 +; test_simple_vt_sqrt_{0, 1, 2} ; v_sqrt_f16 +; test_simple_vt_trunc_{0, 1, 2} ; v_trunc_f16 + +declare i32 @llvm.amdgcn.workitem.id.x() + +declare half @llvm.ceil.f16(half %a) +declare half @llvm.cos.f16(half %a) +declare half @llvm.exp2.f16(half %a) +declare half @llvm.floor.f16(half %a) +declare half @llvm.log2.f16(half %a) +declare half @llvm.rint.f16(half %a) +declare half @llvm.sin.f16(half %a) +declare half @llvm.sqrt.f16(half %a) +declare half @llvm.trunc.f16(half %a) + +; GCN-LABEL: {{^}}test_simple_vt_ceil_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_ceil_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_ceil_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.ceil.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_ceil_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_ceil_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_ceil_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.ceil.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cos_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], 0x3e22f983, v[[A_F32]] +; GCN: v_fract_f32_e32 v[[F_F32:[0-9]+]], v[[M_F32]] +; GCN: v_cos_f32_e32 v[[R_F32:[0-9]+]], v[[F_F32]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_cos_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.cos.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cos_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], 0x3e22f983, v[[A_F32]] +; GCN: v_fract_f32_e32 v[[F_F32:[0-9]+]], v[[M_F32]] +; GCN: v_cos_f32_e32 v[[R_F32:[0-9]+]], v[[F_F32]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_cos_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, 
half addrspace(1)* %a.ptr + %r.val = call half @llvm.cos.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_float_to_half_1 +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_cvt_float_to_half_1(half addrspace(1)* %r, + float addrspace(1)* %a) { + %a.val = load float, float addrspace(1)* %a + %r.val = fptrunc float %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_float_to_half_2 +; GCN: {{buffer|flat}}_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_cvt_float_to_half_2(half addrspace(1)* %r, + float addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr float, float addrspace(1)* %a, i32 %tid.x + %a.val = load float, float addrspace(1)* %a.ptr + %r.val = fptrunc float %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_signed_short_to_half_1 +; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]] +; SI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_f16_i16_e32 v[[R_F16:[0-9]+]], v[[A_I16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_cvt_signed_short_to_half_1(half addrspace(1)* %r, + i16 addrspace(1)* %a) { + %a.val = load i16, i16 addrspace(1)* %a + %r.val = sitofp i16 %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_signed_short_to_half_2 +; GCN: {{buffer|flat}}_load_{{sshort|ushort}} v[[A_I16:[0-9]+]] +; SI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_f16_i16_e32 v[[R_F16:[0-9]+]], v[[A_I16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_cvt_signed_short_to_half_2(half addrspace(1)* %r, + i16 addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr i16, i16 addrspace(1)* %a, i32 %tid.x + %a.val = load i16, i16 addrspace(1)* %a.ptr + %r.val = sitofp i16 %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_unsigned_short_to_half_1 +; GCN: buffer_load_ushort v[[A_I16:[0-9]+]] +; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_f16_u16_e32 v[[R_F16:[0-9]+]], v[[A_I16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_cvt_unsigned_short_to_half_1(half addrspace(1)* %r, + i16 addrspace(1)* %a) { + %a.val = load i16, i16 addrspace(1)* %a + %r.val = uitofp i16 %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_unsigned_short_to_half_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_I16:[0-9]+]] +; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_f16_u16_e32 v[[R_F16:[0-9]+]], v[[A_I16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_cvt_unsigned_short_to_half_2(half addrspace(1)* %r, + i16 addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr i16, i16 addrspace(1)* %a, i32 %tid.x + %a.val = load i16, i16 addrspace(1)* %a.ptr 
+ %r.val = uitofp i16 %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_half_to_float_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_dword v[[R_F32]] +define void @test_simple_vt_cvt_half_to_float_1(float addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = fpext half %a.val to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_half_to_float_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_dword v[[R_F32]] +define void @test_simple_vt_cvt_half_to_float_2(float addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = fpext half %a.val to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_half_to_signed_short_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_I16]] +define void @test_simple_vt_cvt_half_to_signed_short_1(i16 addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = fptosi half %a.val to i16 + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_half_to_signed_short_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_I16]] +define void @test_simple_vt_cvt_half_to_signed_short_2(i16 addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = fptosi half %a.val to i16 + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_half_to_unsigned_short_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_u16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_I16]] +define void @test_simple_vt_cvt_half_to_unsigned_short_1(i16 addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = fptoui half %a.val to i16 + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_cvt_half_to_unsigned_short_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_u16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_I16]] +define void @test_simple_vt_cvt_half_to_unsigned_short_2(i16 addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = fptoui half %a.val to i16 + store i16 %r.val, 
i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_exp_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_exp_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_exp_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.exp2.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_exp_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_exp_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_exp_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.exp2.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_floor_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_floor_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_floor_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.floor.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_floor_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_floor_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_floor_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.floor.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_log_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_log_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_log_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.log2.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_log_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_log_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void 
@test_simple_vt_log_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.log2.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_rndne_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_rndne_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.rint.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_rndne_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_rndne_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.rint.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_sin_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], 0x3e22f983, v[[A_F32]] +; GCN: v_fract_f32_e32 v[[F_F32:[0-9]+]], v[[M_F32]] +; GCN: v_sin_f32_e32 v[[R_F32:[0-9]+]], v[[F_F32]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_sin_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.sin.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_sin_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], 0x3e22f983, v[[A_F32]] +; GCN: v_fract_f32_e32 v[[F_F32:[0-9]+]], v[[M_F32]] +; GCN: v_sin_f32_e32 v[[R_F32:[0-9]+]], v[[F_F32]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_sin_2(half addrspace(1)* %r, + half addrspace(1)* %a) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %a.val = load half, half addrspace(1)* %a.ptr + %r.val = call half @llvm.sin.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_sqrt_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_sqrt_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_sqrt_1(half addrspace(1)* %r, + half addrspace(1)* %a) { + %a.val = load half, half addrspace(1)* %a + %r.val = call half 
@llvm.sqrt.f16(half %a.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_simple_vt_sqrt_2
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_sqrt_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+; VI: v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
+; GCN: {{buffer|flat}}_store_short v[[R_F16]]
+define void @test_simple_vt_sqrt_2(half addrspace(1)* %r,
+    half addrspace(1)* %a) {
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x
+  %a.val = load half, half addrspace(1)* %a.ptr
+  %r.val = call half @llvm.sqrt.f16(half %a.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_simple_vt_trunc_1
+; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_trunc_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+; VI: v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
+; GCN: buffer_store_short v[[R_F16]]
+define void @test_simple_vt_trunc_1(half addrspace(1)* %r,
+    half addrspace(1)* %a) {
+  %a.val = load half, half addrspace(1)* %a
+  %r.val = call half @llvm.trunc.f16(half %a.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_simple_vt_trunc_2
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_trunc_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+; VI: v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
+; GCN: {{buffer|flat}}_store_short v[[R_F16]]
+define void @test_simple_vt_trunc_2(half addrspace(1)* %r,
+    half addrspace(1)* %a) {
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x
+  %a.val = load half, half addrspace(1)* %a.ptr
+  %r.val = call half @llvm.trunc.f16(half %a.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
Index: test/CodeGen/AMDGPU/amdgcn-vop2-f16-vi.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/amdgcn-vop2-f16-vi.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; Tests:
+; test_simple_vt_ldexp_{1, 2}    ; v_ldexp_f16
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+declare half @llvm.amdgcn.ldexp.f16(half %a, i16 %b)
+
+; GCN-LABEL: {{^}}test_simple_vt_ldexp_1
+; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: buffer_load_ushort v[[B_I16:[0-9]+]]
+; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I16]]
+; GCN: buffer_store_short v[[R_F16]]
+define void @test_simple_vt_ldexp_1(half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    i16 addrspace(1)* %b) {
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load i16, i16 addrspace(1)* %b
+  %r.val = call half @llvm.amdgcn.ldexp.f16(half %a.val, i16 %b.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_simple_vt_ldexp_2
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_I16:[0-9]+]]
+; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I16]]
+; GCN: {{buffer|flat}}_store_short v[[R_F16]]
+define void @test_simple_vt_ldexp_2(half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    i16 addrspace(1)* %b) {
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
+  %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x
+  %b.ptr = getelementptr i16, i16 addrspace(1)* %b, i32 %tid.y
+  %a.val = load half, half addrspace(1)* %a.ptr
+  %b.val = load i16, i16 addrspace(1)* %b.ptr
+  %r.val = call half @llvm.amdgcn.ldexp.f16(half %a.val, i16 %b.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
Index: test/CodeGen/AMDGPU/amdgcn-vop2-f16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/amdgcn-vop2-f16.ll
@@ -0,0 +1,218 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+declare half @llvm.maxnum.f16(half %a, half %b)
+declare half @llvm.minnum.f16(half %a, half %b)
+
+; GCN-LABEL: {{^}}test_simple_vt_add_1
+; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GCN: buffer_store_short v[[R_F16]]
+define void @test_simple_vt_add_1(half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_simple_vt_add_2
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GCN: {{buffer|flat}}_store_short v[[R_F16]]
+define void @test_simple_vt_add_2(half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
+  %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x
+  %b.ptr = getelementptr half, half addrspace(1)* %b, i32 %tid.y
+  %a.val = load half, half addrspace(1)* %a.ptr
+  %b.val = load half, half addrspace(1)* %b.ptr
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_simple_vt_max_1
+; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GCN: buffer_store_short v[[R_F16]]
+define void @test_simple_vt_max_1(half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
+  store half %r.val,
half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_max_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_max_2(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %b.ptr = getelementptr half, half addrspace(1)* %b, i32 %tid.y + %a.val = load half, half addrspace(1)* %a.ptr + %b.val = load half, half addrspace(1)* %b.ptr + %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_min_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_min_1(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_min_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_min_2(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %b.ptr = getelementptr half, half addrspace(1)* %b, i32 %tid.y + %a.val = load half, half addrspace(1)* %a.ptr + %b.val = load half, half addrspace(1)* %b.ptr + %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_mul_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_mul_1(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half 
addrspace(1)* %b + %r.val = fmul half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_mul_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_mul_2(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %b.ptr = getelementptr half, half addrspace(1)* %b, i32 %tid.y + %a.val = load half, half addrspace(1)* %a.ptr + %b.val = load half, half addrspace(1)* %b.ptr + %r.val = fmul half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_sub_1 +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_sub_1(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fsub half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_sub_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_sub_2(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %b.ptr = getelementptr half, half addrspace(1)* %b, i32 %tid.y + %a.val = load half, half addrspace(1)* %a.ptr + %b.val = load half, half addrspace(1)* %b.ptr + %r.val = fsub half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/amdgcn-vop3-f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgcn-vop3-f16.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare i32 @llvm.amdgcn.workitem.id.z() + +declare half @llvm.fma.f16(half %a, half %b, half %c) + +; GCN-LABEL: {{^}}test_simple_vt_fma_1 +; 
GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +define void @test_simple_vt_fma_1(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c){ + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_simple_vt_fma_2 +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; GCN: {{buffer|flat}}_store_short v[[R_F16]] +define void @test_simple_vt_fma_2(half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c){ + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %tid.z = call i32 @llvm.amdgcn.workitem.id.z() + %a.ptr = getelementptr half, half addrspace(1)* %a, i32 %tid.x + %b.ptr = getelementptr half, half addrspace(1)* %b, i32 %tid.y + %c.ptr = getelementptr half, half addrspace(1)* %c, i32 %tid.z + %a.val = load half, half addrspace(1)* %a.ptr + %b.val = load half, half addrspace(1)* %b.ptr + %c.val = load half, half addrspace(1)* %c.ptr + %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ /dev/null @@ -1,636 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -; half args should be promoted to float - -; GCN-LABEL: {{^}}load_f16_arg: -; GCN: s_load_dword [[ARG:s[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] -; GCN: buffer_store_short [[CVT]] -define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { - store half %arg, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v2f16_arg: -; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] -; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]] -; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN: s_endpgm -define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { - store <2 x half> %arg, <2 x half> addrspace(1)* %out 
- ret void -} - -; GCN-LABEL: {{^}}load_v3f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN-NOT: buffer_load -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_short -; GCN-NOT: buffer_store -; GCN: s_endpgm -define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { - store <3 x half> %arg, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v4f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_dwordx2 -; GCN: s_endpgm -define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { - store <4 x half> %arg, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v8f16_arg: -define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { - store <8 x half> %arg, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_arg: -define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { - %fpext = fpext <2 x half> %in to <2 x float> - store <2 x float> %fpext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f32_arg: -define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to float - store float %ext, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: -define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x float> - store <2 x float> %ext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN-NOT: buffer_load -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN-NOT: v_cvt_f32_f16 -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_dwordx2 -; GCN: s_endpgm -define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x float> - store <3 x float> %ext, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: -define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x float> - store <4 x float> %ext, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort - -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 - -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 -define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x float> - store <8 x float> %ext, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f64_arg: -; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} -; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}} -; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]] -; GCN: buffer_store_dwordx2 [[RESULT]] -define void 
@extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to double - store double %ext, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN: s_endpgm -define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x double> - store <2 x double> %ext, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN: s_endpgm -define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x double> - store <3 x double> %ext, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN: s_endpgm -define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x double> - store <4 x double> %ext, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v - -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v - -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 - -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 - -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 - -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 - -; GCN: s_endpgm -define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x double> - store <8 x double> %ext, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_f16: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - store half %val, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v2f16: -; GCN: buffer_load_dword [[TMP:v[0-9]+]] -; GCN: buffer_store_dword [[TMP]] -define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - store <2 x half> %val, <2 x half> addrspace(1)* %out - ret void -} - -; 
GCN-LABEL: {{^}}global_load_store_v4f16: -; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[TMP]] -define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - store <4 x half> %val, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v8f16: -; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: s_endpgm -define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - store <8 x half> %val, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f32: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_dword [[CVT]] -define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to float - store float %cvt, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] -; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} -; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x float> - store <2 x float> %cvt, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: -define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x float> - store <3 x float> %cvt, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: -define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: -define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x float> - store <8 x float> %cvt, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 - -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 - -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 - -; GCN: s_endpgm -define void @global_extload_v16f16_to_v16f32(<16 x float> 
addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x float> - store <16 x float> %cvt, <16 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f64: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] -; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] -; GCN: buffer_store_dwordx2 [[CVT1]] -define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to double - store double %cvt, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: -; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] -; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} -; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x double> - store <2 x double> %cvt, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: - -; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XSI: v_cvt_f32_f16_e32 -; XSI: v_cvt_f32_f16_e32 -; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; XSI: v_cvt_f32_f16_e32 -; XSI-NOT: v_cvt_f32_f16 - -; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XVI: v_cvt_f32_f16_e32 -; XVI: v_cvt_f32_f16_e32 -; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; XVI: v_cvt_f32_f16_e32 -; XVI-NOT: v_cvt_f32_f16 - -; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] -; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] -; GCN: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] -; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] - -; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] -; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] -; GCN-NOT: v_cvt_f64_f32_e32 - -; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 -; GCN: s_endpgm -define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x double> - store <3 x double> %cvt, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: -define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x double> - store <4 x double> %cvt, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: -define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x 
half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x double> - store <8 x double> %cvt, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: -define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x double> - store <16 x double> %cvt, <16 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_short [[CVT]] -define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { - %val = load float, float addrspace(1)* %in - %cvt = fptrunc float %val to half - store half %cvt, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] -; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] -; GCN-DAG: buffer_store_dword [[PACKED]] -; GCN: s_endpgm -define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { - %val = load <2 x float>, <2 x float> addrspace(1)* %in - %cvt = fptrunc <2 x float> %val to <2 x half> - store <2 x half> %cvt, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-NOT: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_dword -; GCN: s_endpgm -define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { - %val = load <3 x float>, <3 x float> addrspace(1)* %in - %cvt = fptrunc <3 x float> %val to <3 x half> - store <3 x half> %cvt, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_dwordx2 -; GCN: s_endpgm -define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { - %val = load <4 x float>, <4 x float> addrspace(1)* %in - %cvt = fptrunc <4 x float> %val to <4 x half> - store <4 x half> %cvt, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_dwordx4 -; GCN: s_endpgm -define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { - %val = load <8 x float>, <8 x float> addrspace(1)* %in - %cvt = fptrunc <8 x float> %val to <8 x half> - store <8 x half> %cvt, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: buffer_load_dwordx4 -; GCN: 
buffer_load_dwordx4 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: buffer_store_dwordx4 -; GCN-DAG: buffer_store_dwordx4 -; GCN: s_endpgm -define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { - %val = load <16 x float>, <16 x float> addrspace(1)* %in - %cvt = fptrunc <16 x float> %val to <16 x half> - store <16 x half> %cvt, <16 x half> addrspace(1)* %out - ret void -} - -; FIXME: Unsafe math should fold conversions away -; GCN-LABEL: {{^}}fadd_f16: -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { - %add = fadd half %a, %b - store half %add, half addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}fadd_v2f16: -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { - %add = fadd <2 x half> %a, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 - ret void -} - -; GCN-LABEL: {{^}}fadd_v4f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 - %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 - %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 - %result = fadd <4 x half> %a, %b - store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 - ret void -} - -; GCN-LABEL: {{^}}fadd_v8f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { - %add = fadd <8 x half> %a, %b - store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 - ret void -} - -; GCN-LABEL: {{^}}fsub_f16: -; GCN: v_subrev_f32_e32 -; GCN: s_endpgm -define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 - %a = load half, half addrspace(1)* %in - %b = load half, half addrspace(1)* %b_ptr - %sub = fsub half %a, %b - store half %sub, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_from_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { - %val = load half, half addrspace(1)* %in - %val_int = bitcast half %val to i16 - store i16 %val_int, i16 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_to_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { - %val = load i16, i16 addrspace(1)* %in - %val_fp = bitcast i16 %val to half - 
store half %val_fp, half addrspace(1)* %out
-  ret void
-}
-
-attributes #0 = { nounwind }
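
Reviewer sketch (not part of the patch): the new VOP1 conversion tests above exercise half-to-float and half-to-integer, but not the float-to-half direction whose coverage is lost with the deleted global_truncstore_f32_to_f16 test in half.ll. A minimal companion test in the same style could look like the following; the function name is illustrative, and both SI and VI should select v_cvt_f16_f32_e32 for the fptrunc, as the removed test checked.

; GCN-LABEL: {{^}}test_simple_vt_cvt_float_to_half_1
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
define void @test_simple_vt_cvt_float_to_half_1(half addrspace(1)* %r,
    float addrspace(1)* %a) {
  %a.val = load float, float addrspace(1)* %a
  %r.val = fptrunc float %a.val to half
  store half %r.val, half addrspace(1)* %r
  ret void
}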