Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -187,6 +187,12 @@ // Subtarget Features (options and debugging) //===------------------------------------------------------------===// +def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", + "FP16Denormals", + "true", + "Enable half precision denormal handling" +>; + // Some instructions do not support denormals despite this flag. Using // fp32 denormals also causes instructions to run at the double // precision rate for the device. Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -563,7 +563,8 @@ bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; + return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && + VT == MVT::f16); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,6 +42,7 @@ field bits<32> Inst = 0xffffffff; } +def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">; def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -75,6 +75,7 @@ bool HalfRate64Ops; // Dynamially set bits that enable features. 
+ bool FP16Denormals; bool FP32Denormals; bool FP64Denormals; bool FPExceptions; @@ -270,6 +271,9 @@ /// the given LDS memory size is the only constraint. unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + bool hasFP16Denormals() const { + return FP16Denormals; + } bool hasFP32Denormals() const { return FP32Denormals; Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -56,6 +56,7 @@ // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP16Denormals = false; FP32Denormals = false; FP64Denormals = false; } @@ -81,6 +82,7 @@ FastFMAF32(false), HalfRate64Ops(false), + FP16Denormals(false), FP32Denormals(false), FP64Denormals(false), FPExceptions(false), Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1368,10 +1368,11 @@ getForcedEncodingSize() != 64) return Match_PreferE32; - if (Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa || - Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa) { + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa || + Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa) { // v_mac_f32/16 allow only dst_sel == DWORD; - auto OpNum = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel); + auto OpNum = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel); const auto &Op = Inst.getOperand(OpNum); if (!Op.isImm() || Op.getImm() != AMDGPU::SDWA::SdwaSel::DWORD) { return Match_InvalidOperand; @@ -2714,14 +2715,20 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); - // special 
case v_mac_f32: + // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers // should be 0 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si || - Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi) { + Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || + Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) { auto it = Inst.begin(); - std::advance(it, AMDGPU::getNamedOperandIdx(AMDGPU::V_MAC_F32_e64, AMDGPU::OpName::src2_modifiers)); + std::advance( + it, + AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ? + AMDGPU::V_MAC_F16_e64 : + AMDGPU::V_MAC_F32_e64, + AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 ++it; Inst.insert(it, Inst.getOperand(0)); // src2 = dst @@ -2896,11 +2903,13 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); - // special case v_mac_f32: + // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp) { + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp || + Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) { auto it = Inst.begin(); - std::advance(it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + std::advance( + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } } @@ -3040,11 +3049,13 @@ } } - // special case v_mac_f32: + // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa) { + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa || + Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa) { auto it = Inst.begin(); - std::advance(it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + 
std::advance( + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -156,13 +156,15 @@ const SIInstrInfo *TII) { if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { - // Special case for v_mac_f32_e64 if we are trying to fold into src2 + // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_MAC_F32_e64 && + if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - // Check if changing this to a v_mad_f32 instruction will allow us to - // fold the operand. - MI->setDesc(TII->get(AMDGPU::V_MAD_F32)); + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + + // Check if changing this to a v_mad_{f16, f32} instruction will allow us + // to fold the operand. + MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16)); bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); if (FoldAsMAD) { MI->untieRegOperand(OpNo); @@ -239,10 +241,10 @@ // make sense. e.g. 
don't fold: // // %vreg1 = COPY %vreg0:sub1 - // %vreg2 = V_MAC_F32 %vreg3, %vreg4, %vreg1 + // %vreg2 = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1 // // into - // %vreg2 = V_MAC_F32 %vreg3, %vreg4, %vreg0:sub1 + // %vreg2 = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1 if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister) return; } Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -46,6 +46,19 @@ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + /// \brief Converts \p Op, which must be of floating point type, to the + /// floating point type \p VT, by either extending or truncating it. + SDValue GetFPExtOrFPTrunc(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL, + EVT VT) const; + + /// \brief Custom lowering for ISD::ConstantFP. + SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Custom lowering for ISD::SINT_TO_FP, ISD::UINT_TO_FP. 
+ SDValue LowerIntToFp(SDValue Op, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -78,8 +78,10 @@ addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); - if (Subtarget->has16BitInsts()) + if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + } computeRegisterProperties(STI.getRegisterInfo()); @@ -263,14 +265,36 @@ setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); - AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32); - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); - AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); + + // F16 - Constant Actions. + setOperationAction(ISD::ConstantFP, MVT::f16, Custom); + + // F16 - Load/Store Actions. + setOperationAction(ISD::LOAD, MVT::f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); + setOperationAction(ISD::STORE, MVT::f16, Promote); + AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); + + // F16 - VOP1 Actions. + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + + // F16 - VOP2 Actions. 
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+    setOperationAction(ISD::FDIV, MVT::f16, Promote);
+
+    // F16 - VOP3 Actions.
+    setOperationAction(ISD::FMA, MVT::f16, Legal);
+    if (!Subtarget->hasFP16Denormals())
+      setOperationAction(ISD::FMAD, MVT::f16, Legal);
   }
 
   setTargetDAGCombine(ISD::FADD);
@@ -635,6 +659,7 @@
   return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                      DAG.getConstant(Offset, SL, PtrVT));
 }
+
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          const SDLoc &SL, SDValue Chain,
                                          unsigned Offset, bool Signed) const {
@@ -653,7 +678,7 @@
 
   SDValue Val;
   if (MemVT.isFloatingPoint())
-    Val = DAG.getNode(ISD::FP_EXTEND, SL, VT, Load);
+    Val = GetFPExtOrFPTrunc(DAG, Load, SL, VT);
   else if (Signed)
     Val = DAG.getSExtOrTrunc(Load, SL, VT);
   else
@@ -1796,6 +1821,12 @@
   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
   case ISD::TRAP: return lowerTRAP(Op, DAG);
+
+  case ISD::ConstantFP:
+    return LowerConstantFP(Op, DAG);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerIntToFp(Op, DAG);
   }
   return SDValue();
 }
@@ -1989,6 +2020,45 @@
   return Chain;
 }
 
+SDValue SITargetLowering::GetFPExtOrFPTrunc(SelectionDAG &DAG,
+                                            SDValue Op,
+                                            const SDLoc &DL,
+                                            EVT VT) const {
+  // Narrowing must use FP_ROUND (with its "is exact" flag operand, 0 here).
+  // FTRUNC rounds to an integral value and cannot change the value type.
+  return Op.getValueType().bitsLE(VT) ?
+      DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
+      DAG.getNode(ISD::FP_ROUND, DL, VT, Op, DAG.getIntPtrConstant(0, DL));
+}
+
+SDValue SITargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG) const {
+  // Replace the FP constant with an i32 constant holding its raw bit pattern.
+  if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op))
+    return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(),
+                           SDLoc(Op), MVT::i32);
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerIntToFp(SDValue Op, SelectionDAG &DAG) const {
+  // 64-bit integer sources are delegated to the common AMDGPU lowering.
+  if (Op.getOperand(0).getValueType() == MVT::i64)
+    return Op.getOpcode() == ISD::SINT_TO_FP ?
+        AMDGPUTargetLowering::LowerSINT_TO_FP(Op, DAG) :
+        AMDGPUTargetLowering::LowerUINT_TO_FP(Op, DAG);
+
+  // Conversions producing f16 need no rewriting; hand the node back unchanged.
+  EVT DestVT = Op.getValueType();
+  if (DestVT == MVT::f16)
+    return Op;
+
+  // Otherwise sign-/zero-extend (or truncate) the source to i32 and convert.
+  SDValue SExtOrZExtOrTrunc = Op.getOpcode() == ISD::SINT_TO_FP ?
+    DAG.getSExtOrTrunc(Op.getOperand(0), SDLoc(Op), MVT::i32) :
+    DAG.getZExtOrTrunc(Op.getOperand(0), SDLoc(Op), MVT::i32);
+  return DAG.getNode(Op.getOpcode(), SDLoc(Op), DestVT, SExtOrZExtOrTrunc);
+}
+
 SDValue SITargetLowering::getSegmentAperture(unsigned AS,
                                              SelectionDAG &DAG) const {
   SDLoc SL;
@@ -3556,7 +3626,8 @@
   SDValue RHS = N->getOperand(1);
   EVT VT = LHS.getValueType();
 
-  if (VT != MVT::f32 && VT != MVT::f64)
+  if (VT != MVT::f32 && VT != MVT::f64 && !(Subtarget->has16BitInsts() &&
+                                            VT == MVT::f16))
     return SDValue();
 
   // Match isinf pattern
@@ -3700,8 +3771,7 @@
   //
   // Only do this if we are not trying to support denormals. v_mad_f32 does
   // not support denormals ever.
-  if (VT == MVT::f32 &&
-      !Subtarget->hasFP32Denormals()) {
+  if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) {
     SDValue LHS = N->getOperand(0);
     SDValue RHS = N->getOperand(1);
     if (LHS.getOpcode() == ISD::FADD) {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1445,7 +1445,10 @@
     return true;
   }
 
-  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
+  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+
     // Don't fold if we are using source modifiers. The new VOP2 instructions
     // don't have them.
if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || @@ -1466,7 +1469,7 @@ MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); - // Multiplied part is the constant: Use v_madmk_f32 + // Multiplied part is the constant: Use v_madmk_{f16, f32}. // We should only expect these to be on src0 due to canonicalizations. if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) @@ -1494,15 +1497,15 @@ Src0->setSubReg(Src1SubReg); Src0->setIsKill(Src1->isKill()); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); + UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1511,7 +1514,7 @@ return true; } - // Added part is the constant: Use v_madak_f32 + // Added part is the constant: Use v_madak_{f16, f32}. if (Src2->isReg() && Src2->getReg() == Reg) { // Not allowed to use constant bus for another operand. // We can however allow an inline immediate as src0. @@ -1533,17 +1536,17 @@ UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); + UseMI.setDesc(get(IsF32 ? 
AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1652,12 +1655,17 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { + bool IsF16 = false; switch (MI.getOpcode()) { default: return nullptr; + case AMDGPU::V_MAC_F16_e64: + IsF16 = true; case AMDGPU::V_MAC_F32_e64: break; + case AMDGPU::V_MAC_F16_e32: + IsF16 = true; case AMDGPU::V_MAC_F32_e32: { const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); if (Src0->isImm() && !isInlineConstant(*Src0, 4)) @@ -1671,7 +1679,8 @@ const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); - return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), + get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) .addOperand(*Dst) .addImm(0) // Src0 mods .addOperand(*Src0) Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -939,14 +939,12 @@ let HasExt = 0; } -// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order -// for the instruction patterns to work. 
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; @@ -964,6 +962,7 @@ def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; +def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; @@ -976,6 +975,8 @@ def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_F16_F32_F16_F32 : VOPProfile <[f16, f32, f16, f32]>; +def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>; def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -413,6 +413,26 @@ } // End Predicates = [UnsafeFPMath] +def : Pat < + (f16 (fpround f32:$src)), + (V_CVT_F16_F32_e32 $src) +>; + +def : Pat < + (f16 (fpround f64:$src)), + (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src)) +>; + +def : Pat < + (f32 (fpextend f16:$src)), + (V_CVT_F32_F16_e32 $src) +>; + +def : Pat < + (f64 (fpextend f16:$src)), + 
(V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) +>; + //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// @@ -427,11 +447,20 @@ (V_CNDMASK_B32_e64 $src2, $src1, $src0) >; +// Pattern for V_MAC_F16 +def : Pat < + (f16 (fmad (VOP3NoMods0 f16:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f16:$src1, i32:$src1_modifiers), + (VOP3NoMods f16:$src2, i32:$src2_modifiers))), + (V_MAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + $src2_modifiers, $src2, $clamp, $omod) +>; + // Pattern for V_MAC_F32 def : Pat < - (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3NoMods f32:$src1, i32:$src1_modifiers), - (VOP3NoMods f32:$src2, i32:$src2_modifiers)), + (f32 (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f32:$src1, i32:$src1_modifiers), + (VOP3NoMods f32:$src2, i32:$src2_modifiers))), (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, $clamp, $omod) >; @@ -506,6 +535,12 @@ // FIXME: Why do only some of these type combinations for SReg and // VReg? +// 16-bit bitcast +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + // 32-bit bitcast def : BitConvert ; def : BitConvert ; Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -123,7 +123,7 @@ // TODO: Do we need to set DwarfRegAlias on register tuples? 
// SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add (sequence "SGPR%u", 0, 103))> { let AllocationPriority = 1; } @@ -190,8 +190,7 @@ (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -// i16 only on VI+ -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -259,7 +258,7 @@ } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> { let AllocationPriority = 1; } @@ -347,7 +346,8 @@ let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> { +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add VGPR_32, SReg_32)> { let isAllocatable = 0; } Index: lib/Target/AMDGPU/SISchedule.td =================================================================== --- lib/Target/AMDGPU/SISchedule.td +++ lib/Target/AMDGPU/SISchedule.td @@ -26,6 +26,7 @@ def WriteBarrier : SchedWrite; // Vector ALU instructions +def Write16Bit : SchedWrite; def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; def WriteFullOrQuarterRate32 : SchedWrite; @@ -101,6 +102,7 @@ def : HWWriteRes; def : HWWriteRes; // XXX: Guessed ??? 
+ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -109,6 +111,7 @@ def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>; def WriteCopy : SchedWriteVariant<[ + SchedVar, SchedVar, SchedVar, SchedVar]>; Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -90,6 +90,7 @@ switch (MI.getOpcode()) { default: return false; + case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_MAC_F32_e64: if (!isVGPR(Src2, TRI, MRI) || TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) Index: lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP1Instructions.td +++ lib/Target/AMDGPU/VOP1Instructions.td @@ -280,24 +280,28 @@ let SubtargetPredicate = isVI in { -defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16>; -defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16>; -defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16>; -defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16>; -defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16>; -defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16>; -defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16>; -defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16>; -defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16>; -defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16>; -defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16>; -defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16>; -defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16>; -defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16>; -defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16>; -defm V_FRACT_F16 : VOP1Inst 
<"v_fract_f16", VOP_F16_F16>; -defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16>; -defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>; +defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>; +defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>; +defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; +defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; +defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; +defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>; +defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; +defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>; +defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; + +// FIXME: V_FREXP_EXP_I16_F16 requires a change to llvm.amdgcn.frexp.exp +// intrinsic. +defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16/*, int_amdgcn_frexp_exp*/>; + +defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; +defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; +defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; +defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; +defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; +defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; +defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -322,25 +322,31 @@ defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_B16 : VOP2Inst <"v_ashrrev_b16", VOP_I16_I16_I16>; -defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I16>; + 
+// FIXME: V_LDEXP_F16 requires a change to llvm.amdgcn.ldexp intrinsic. +defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I16/*, AMDGPUldexp*/>; let isCommutable = 1 in { -defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16>; -defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16>; +defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; +defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; -defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16>; -defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_F16_F16_F16>; +defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK>; defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16>; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; -defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16>; -defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16>; +defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>; +defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>; defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; + +let Constraints = "$vdst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1 in { +defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC>; +} } // End isCommutable = 1 } // End SubtargetPredicate = isVI Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -215,10 +215,18 @@ let SubtargetPredicate = isVI in { let isCommutable = 1 in { - def V_MAD_F16 
: VOP3Inst <"v_mad_f16", VOP3_Profile>; - def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; - def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; -} + +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup>; +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma>; +def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile>; +def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile>; +def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile>; +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; + +def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; +def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; + +} // End isCommutable = 1 } // End SubtargetPredicate = isVI @@ -415,6 +423,12 @@ defm V_MAD_U16 : VOP3_Real_vi <0x1eb>; defm V_MAD_I16 : VOP3_Real_vi <0x1ec>; +defm V_FMA_F16 : VOP3_Real_vi <0x1ee>; +defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>; + +defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>; +defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>; +defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>; defm V_ADD_F64 : VOP3_Real_vi <0x280>; defm V_MUL_F64 : VOP3_Real_vi <0x281>; defm V_MIN_F64 : VOP3_Real_vi <0x282>; Index: lib/Target/AMDGPU/VOPCInstructions.td =================================================================== --- lib/Target/AMDGPU/VOPCInstructions.td +++ lib/Target/AMDGPU/VOPCInstructions.td @@ -144,11 +144,15 @@ } } +def VOPC_I1_F16_F16 : VOPC_Profile<[Write16Bit], f16>; def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>; def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>; def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>; def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>; +multiclass VOPC_F16 : + VOPC_Pseudos ; + multiclass VOPC_F32 : VOPC_Pseudos ; @@ -161,6 +165,9 @@ multiclass VOPC_I64 : VOPC_Pseudos ; +multiclass VOPCX_F16 : + VOPC_Pseudos ; + multiclass VOPCX_F32 : VOPC_Pseudos ; @@ -318,6 +325,44 @@ } // End SubtargetPredicate = isSICI +let SubtargetPredicate = 
isVI in { + +defm V_CMP_F_F16 : VOPC_F16 <"v_cmp_f_f16">; +defm V_CMP_LT_F16 : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">; +defm V_CMP_EQ_F16 : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>; +defm V_CMP_LE_F16 : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">; +defm V_CMP_GT_F16 : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>; +defm V_CMP_LG_F16 : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>; +defm V_CMP_GE_F16 : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>; +defm V_CMP_O_F16 : VOPC_F16 <"v_cmp_o_f16", COND_O>; +defm V_CMP_U_F16 : VOPC_F16 <"v_cmp_u_f16", COND_UO>; +defm V_CMP_NGE_F16 : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">; +defm V_CMP_NLG_F16 : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>; +defm V_CMP_NGT_F16 : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">; +defm V_CMP_NLE_F16 : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>; +defm V_CMP_NEQ_F16 : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>; +defm V_CMP_NLT_F16 : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>; +defm V_CMP_TRU_F16 : VOPC_F16 <"v_cmp_tru_f16">; + +defm V_CMPX_F_F16 : VOPCX_F16 <"v_cmpx_f_f16">; +defm V_CMPX_LT_F16 : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">; +defm V_CMPX_EQ_F16 : VOPCX_F16 <"v_cmpx_eq_f16">; +defm V_CMPX_LE_F16 : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">; +defm V_CMPX_GT_F16 : VOPCX_F16 <"v_cmpx_gt_f16">; +defm V_CMPX_LG_F16 : VOPCX_F16 <"v_cmpx_lg_f16">; +defm V_CMPX_GE_F16 : VOPCX_F16 <"v_cmpx_ge_f16">; +defm V_CMPX_O_F16 : VOPCX_F16 <"v_cmpx_o_f16">; +defm V_CMPX_U_F16 : VOPCX_F16 <"v_cmpx_u_f16">; +defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16">; +defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16">; +defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">; +defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">; + +} // End SubtargetPredicate = isVI + defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">; defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, 
"v_cmp_gt_i32">; defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">; @@ -429,9 +474,16 @@ } } +def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write16Bit], f16>; def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>; def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>; +multiclass VOPC_CLASS_F16 : + VOPC_Class_Pseudos ; + +multiclass VOPCX_CLASS_F16 : + VOPC_Class_Pseudos ; + multiclass VOPC_CLASS_F32 : VOPC_Class_Pseudos ; @@ -448,6 +500,8 @@ defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">; defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">; +defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">; +defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; //===----------------------------------------------------------------------===// // V_ICMPIntrinsic Pattern. @@ -810,147 +864,183 @@ } } -defm V_CMP_F_F32 : VOPC_Real_vi <0x40>; -defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>; -defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>; -defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>; -defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>; -defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>; -defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>; -defm V_CMP_O_F32 : VOPC_Real_vi <0x47>; -defm V_CMP_U_F32 : VOPC_Real_vi <0x48>; -defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>; -defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>; -defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>; -defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>; -defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>; -defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>; -defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>; - -defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>; -defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>; -defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>; -defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>; -defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>; -defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>; -defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>; -defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>; -defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>; -defm V_CMPX_NGE_F32 : VOPC_Real_vi 
<0x59>; -defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>; -defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>; -defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>; -defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>; -defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>; -defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>; - -defm V_CMP_F_F64 : VOPC_Real_vi <0x60>; -defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>; -defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>; -defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>; -defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>; -defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>; -defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>; -defm V_CMP_O_F64 : VOPC_Real_vi <0x67>; -defm V_CMP_U_F64 : VOPC_Real_vi <0x68>; -defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>; -defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>; -defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>; -defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>; -defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>; -defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>; -defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>; - -defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>; -defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>; -defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>; -defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>; -defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>; -defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>; -defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>; -defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>; -defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>; -defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>; -defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>; -defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>; -defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>; -defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>; -defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>; -defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>; - -defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>; -defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>; -defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>; -defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>; -defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>; -defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>; -defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>; -defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>; - -defm 
V_CMPX_F_I32 : VOPC_Real_vi <0xd0>; -defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>; -defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>; -defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>; -defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>; -defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>; -defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>; -defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>; - -defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>; -defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>; -defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>; -defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>; -defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>; -defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>; -defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>; -defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>; - -defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>; -defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>; -defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>; -defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>; -defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>; -defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>; -defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>; -defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>; - -defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>; -defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>; -defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>; -defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>; -defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>; -defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>; -defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>; -defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>; - -defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>; -defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>; -defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>; -defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>; -defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>; -defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>; -defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>; -defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>; - -defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>; -defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>; -defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>; -defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>; -defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>; -defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>; -defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>; -defm 
V_CMP_T_U64 : VOPC_Real_vi <0xef>; - -defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>; -defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>; -defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>; -defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>; -defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>; -defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>; -defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>; -defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>; - defm V_CMP_CLASS_F32 : VOPC_Real_vi <0x10>; defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>; defm V_CMP_CLASS_F64 : VOPC_Real_vi <0x12>; defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>; +defm V_CMP_CLASS_F16 : VOPC_Real_vi <0x14>; +defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x15>; + +defm V_CMP_F_F16 : VOPC_Real_vi <0x20>; +defm V_CMP_LT_F16 : VOPC_Real_vi <0x21>; +defm V_CMP_EQ_F16 : VOPC_Real_vi <0x22>; +defm V_CMP_LE_F16 : VOPC_Real_vi <0x23>; +defm V_CMP_GT_F16 : VOPC_Real_vi <0x24>; +defm V_CMP_LG_F16 : VOPC_Real_vi <0x25>; +defm V_CMP_GE_F16 : VOPC_Real_vi <0x26>; +defm V_CMP_O_F16 : VOPC_Real_vi <0x27>; +defm V_CMP_U_F16 : VOPC_Real_vi <0x28>; +defm V_CMP_NGE_F16 : VOPC_Real_vi <0x29>; +defm V_CMP_NLG_F16 : VOPC_Real_vi <0x2a>; +defm V_CMP_NGT_F16 : VOPC_Real_vi <0x2b>; +defm V_CMP_NLE_F16 : VOPC_Real_vi <0x2c>; +defm V_CMP_NEQ_F16 : VOPC_Real_vi <0x2d>; +defm V_CMP_NLT_F16 : VOPC_Real_vi <0x2e>; +defm V_CMP_TRU_F16 : VOPC_Real_vi <0x2f>; + +defm V_CMPX_F_F16 : VOPC_Real_vi <0x30>; +defm V_CMPX_LT_F16 : VOPC_Real_vi <0x31>; +defm V_CMPX_EQ_F16 : VOPC_Real_vi <0x32>; +defm V_CMPX_LE_F16 : VOPC_Real_vi <0x33>; +defm V_CMPX_GT_F16 : VOPC_Real_vi <0x34>; +defm V_CMPX_LG_F16 : VOPC_Real_vi <0x35>; +defm V_CMPX_GE_F16 : VOPC_Real_vi <0x36>; +defm V_CMPX_O_F16 : VOPC_Real_vi <0x37>; +defm V_CMPX_U_F16 : VOPC_Real_vi <0x38>; +defm V_CMPX_NGE_F16 : VOPC_Real_vi <0x39>; +defm V_CMPX_NLG_F16 : VOPC_Real_vi <0x3a>; +defm V_CMPX_NGT_F16 : VOPC_Real_vi <0x3b>; +defm V_CMPX_NLE_F16 : VOPC_Real_vi <0x3c>; +defm V_CMPX_NEQ_F16 : VOPC_Real_vi <0x3d>; +defm V_CMPX_NLT_F16 : VOPC_Real_vi <0x3e>; +defm V_CMPX_TRU_F16 
: VOPC_Real_vi <0x3f>; + +defm V_CMP_F_F32 : VOPC_Real_vi <0x40>; +defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>; +defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>; +defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>; +defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>; +defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>; +defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>; +defm V_CMP_O_F32 : VOPC_Real_vi <0x47>; +defm V_CMP_U_F32 : VOPC_Real_vi <0x48>; +defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>; +defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>; +defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>; +defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>; +defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>; +defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>; +defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>; + +defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>; +defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>; +defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>; +defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>; +defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>; +defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>; +defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>; +defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>; +defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>; +defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>; +defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>; +defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>; +defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>; +defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>; +defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>; +defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>; + +defm V_CMP_F_F64 : VOPC_Real_vi <0x60>; +defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>; +defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>; +defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>; +defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>; +defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>; +defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>; +defm V_CMP_O_F64 : VOPC_Real_vi <0x67>; +defm V_CMP_U_F64 : VOPC_Real_vi <0x68>; +defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>; +defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>; +defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>; +defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>; +defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>; +defm 
V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>; +defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>; + +defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>; +defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>; +defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>; +defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>; +defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>; +defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>; +defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>; +defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>; +defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>; +defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>; +defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>; +defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>; +defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>; +defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>; +defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>; +defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>; + +defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>; +defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>; +defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>; +defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>; +defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>; +defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>; +defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>; +defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>; + +defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>; +defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>; +defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>; +defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>; +defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>; +defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>; +defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>; +defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>; + +defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>; +defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>; +defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>; +defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>; +defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>; +defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>; +defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>; +defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>; + +defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>; +defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>; +defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>; +defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>; +defm V_CMPX_GT_I64 : VOPC_Real_vi 
<0xf4>; +defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>; +defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>; +defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>; + +defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>; +defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>; +defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>; +defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>; +defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>; +defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>; +defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>; +defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>; + +defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>; +defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>; +defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>; +defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>; +defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>; +defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>; +defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>; +defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>; + +defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>; +defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>; +defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>; +defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>; +defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>; +defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>; +defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>; +defm V_CMP_T_U64 : VOPC_Real_vi <0xef>; + +defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>; +defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>; +defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>; +defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>; +defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>; +defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>; +defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>; +defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>; Index: test/CodeGen/AMDGPU/fadd.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fadd.f16.ll @@ -0,0 +1,150 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_add +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: 
buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_add( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fadd half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_add_imm_a +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_add_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b) { +entry: + %b.val = load half, half addrspace(1)* %b + %r.val = fadd half 1.0, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_add_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4000{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_add_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = fadd half %a.val, 2.0 + store half %r.val, half addrspace(1)* %r + ret void +} + +; 
GCN-LABEL: {{^}}vector_vt_add +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_add( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fadd <2 x half> %a.val, %b.val + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_add_imm_a +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cvt_f16_f32_e32 
v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]] +; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_add_imm_a( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %b) { +entry: + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_add_imm_b +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x4000, v[[A_V2_F16]] +; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x3c00, v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_add_imm_b( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> 
addrspace(1)* %a + %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fcmp.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fcmp.f16.ll @@ -0,0 +1,744 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_cmp_lt +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_lt( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp olt half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_eq +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_eq( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp oeq half %a.val, %b.val + 
%r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_le +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_le( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp ole half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_gt +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_gt( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp ogt half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_lg +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_lg_f16_e32 vcc, v[[A_F16]], 
v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_lg( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp one half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_ge +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_ge( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp oge half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_o +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_o( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp ord half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r 
+ ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_u +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_u( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp uno half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_nge +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_nge( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp ult half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_nlg +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword 
v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_nlg( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp ueq half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_ngt +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_ngt( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp ule half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_nle +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_nle( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp ugt half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_neq +; GCN: 
buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_neq( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp une half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_cmp_nlt +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_cmp_nlt( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fcmp uge half %a.val, %b.val + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_lt +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 
v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_lt( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp olt <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_eq +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_eq( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 
x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp oeq <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_le +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_le_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_le( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp ole <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_gt +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: 
v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_gt( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp ogt <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_lg +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_lg( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* 
%a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp one <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_ge +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_ge( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp oge <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_o +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] 
+; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_o( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp ord <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_u +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void 
@vector_vt_cmp_u( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp uno <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_nge +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_nge( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp ult <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_nlg +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 
v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_nlg( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp ueq <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_ngt +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 
v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_ngt( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp ule <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_nle +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_nle( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp ugt <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_neq +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword 
v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_neq( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp une <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cmp_nlt +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] +; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], 
v[[A_F16_1]], v[[B_F16_1]] +; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] +; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_cmp_nlt( + <2 x i32> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fcmp uge <2 x half> %a.val, %b.val + %r.val.sext = sext <2 x i1> %r.val to <2 x i32> + store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fdiv.f16.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; Make sure fdiv is promoted to f32. 
+ +; GCN-LABEL: {{^}}simple_vt_div +; GCN: v_cvt_f32_f16 +; GCN: v_cvt_f32_f16 +; GCN: v_div_scale_f32 +; GCN-DAG: v_div_scale_f32 +; GCN-DAG: v_rcp_f32 +; GCN: v_fma_f32 +; GCN: v_fma_f32 +; GCN: v_mul_f32 +; GCN: v_fma_f32 +; GCN: v_fma_f32 +; GCN: v_fma_f32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 +; GCN: v_cvt_f16_f32 +define void @simple_vt_div( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fdiv half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fmul.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fmul.f16.ll @@ -0,0 +1,150 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_mul +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_mul( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fmul half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mul_imm_a +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: 
v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_mul_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b) { +entry: + %b.val = load half, half addrspace(1)* %b + %r.val = fmul half 3.0, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mul_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_mul_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = fmul half %a.val, 4.0 + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_mul +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], 
v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_mul( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fmul <2 x half> %a.val, %b.val + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_mul_imm_a +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_mul_imm_a( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %b) { +entry: + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fmul <2 x half> <half 3.0, half 4.0>, %b.val + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_mul_imm_b +; GCN: buffer_load_dword 
v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]] +; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_mul_imm_b( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = fmul <2 x half> %a.val, <half 4.0, half 3.0> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fpext.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fpext.f16.ll @@ -0,0 +1,70 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_convert_half_to_float +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_dword v[[R_F32]] +; GCN: s_endpgm +define void @simple_vt_convert_half_to_float( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: 
+ %a.val = load half, half addrspace(1)* %a + %r.val = fpext half %a.val to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_convert_half_to_double +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}} +; GCN: s_endpgm +define void @simple_vt_convert_half_to_double( + double addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = fpext half %a.val to double + store double %r.val, double addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_half_to_float +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] +; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} +; GCN: s_endpgm +define void @vector_vt_convert_half_to_float( + <2 x float> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = fpext <2 x half> %a.val to <2 x float> + store <2 x float> %r.val, <2 x float> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_half_to_double +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_cvt_f64_f32_e32 v{{\[}}{{[0-9]+}}:[[R_F64_3:[0-9]+]]{{\]}}, v[[A_F32_1]] +; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:{{[0-9]+}}{{\]}}, v[[A_F32_0]] +; GCN: buffer_store_dwordx4 v{{\[}}[[R_F64_0]]:[[R_F64_3]]{{\]}} +; GCN: s_endpgm +define void 
@vector_vt_convert_half_to_double( + <2 x double> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = fpext <2 x half> %a.val to <2 x double> + store <2 x double> %r.val, <2 x double> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fptosi.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fptosi.f16.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_convert_half_to_signed_short +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_I16]] +; GCN: s_endpgm +define void @simple_vt_convert_half_to_signed_short( + i16 addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = fptosi half %a.val to i16 + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_half_to_signed_short +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] +; VI: v_cvt_i16_f16_e32 v[[R_I16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_cvt_i16_f16_e32 v[[R_I16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] +; GCN: v_or_b32_e32 
v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] +; GCN: buffer_store_dword v[[R_V2_I16]] +; GCN: s_endpgm +define void @vector_vt_convert_half_to_signed_short( + <2 x i16> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = fptosi <2 x half> %a.val to <2 x i16> + store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fptoui.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fptoui.f16.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_convert_half_to_unsigned_short +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_u16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_I16]] +; GCN: s_endpgm +define void @simple_vt_convert_half_to_unsigned_short( + i16 addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = fptoui half %a.val to i16 + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_half_to_unsigned_short +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; VI: v_cvt_u16_f16_e32 v[[R_I16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_cvt_u16_f16_e32 v[[R_I16_1:[0-9]+]], v[[A_F16_1]] +; VI: v_and_b32_e32 
v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] +; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] +; VI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] +; GCN: buffer_store_dword v[[R_V2_I16]] +; GCN: s_endpgm +define void @vector_vt_convert_half_to_unsigned_short( + <2 x i16> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = fptoui <2 x half> %a.val to <2 x i16> + store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fptrunc.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -0,0 +1,72 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_convert_float_to_half +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_convert_float_to_half( + half addrspace(1)* %r, + float addrspace(1)* %a) { +entry: + %a.val = load float, float addrspace(1)* %a + %r.val = fptrunc float %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_convert_double_to_half +; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}} +; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}} +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_convert_double_to_half( + half addrspace(1)* %r, + double addrspace(1)* %a) { +entry: + %a.val = load double, double 
addrspace(1)* %a + %r.val = fptrunc double %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_float_to_half +; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}} +; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] +; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; GCN-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_convert_float_to_half( + <2 x half> addrspace(1)* %r, + <2 x float> addrspace(1)* %a) { +entry: + %a.val = load <2 x float>, <2 x float> addrspace(1)* %a + %r.val = fptrunc <2 x float> %a.val to <2 x half> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_double_to_half +; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}} +; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} +; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} +; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +define void @vector_vt_convert_double_to_half( + <2 x half> addrspace(1)* %r, + <2 x double> addrspace(1)* %a) { +entry: + %a.val = load <2 x double>, <2 x double> addrspace(1)* %a + %r.val = fptrunc <2 x double> %a.val to <2 x half> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/fsub.f16.ll 
=================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fsub.f16.ll @@ -0,0 +1,150 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_sub +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_sub( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fsub half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_sub_imm_a +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_sub_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b) { +entry: + %b.val = load half, half addrspace(1)* %b + %r.val = fsub half 1.0, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_sub_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0xc000{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; 
SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0xc000, v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_sub_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = fsub half %a.val, 2.0 + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_sub +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_sub( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fsub <2 x half> %a.val, %b.val + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: 
{{^}}vector_vt_sub_imm_a +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]] +; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_sub_imm_a( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %b) { +entry: + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_sub_imm_b +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0xc000, 
v[[A_V2_F16]] +; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0xbc00, v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_sub_imm_b( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -1,11 +1,12 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; half args should be promoted to float +; half args should be promoted to float for SI and lower. 
; GCN-LABEL: {{^}}load_f16_arg: ; GCN: s_load_dword [[ARG:s[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] +; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] +; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]] ; GCN: buffer_store_short [[CVT]] define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { store half %arg, half addrspace(1)* %out @@ -131,8 +132,11 @@ ; GCN-LABEL: {{^}}extload_f16_to_f64_arg: ; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} +; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]] ; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}} -; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]] +; VI: v_trunc_f16_e32 v[[VARG:[0-9]+]], [[ARG]] +; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]] +; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]] ; GCN: buffer_store_dwordx2 [[RESULT]] define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to double @@ -279,8 +283,9 @@ ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: ; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] ; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] -; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] ; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm @@ -387,16 +392,17 @@ ; XSI-NOT: v_cvt_f32_f16 ; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} ; XVI: v_cvt_f32_f16_e32 ; XVI: v_cvt_f32_f16_e32 -; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} ; XVI: v_cvt_f32_f16_e32 ; XVI-NOT: v_cvt_f32_f16 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] +; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] ; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], 
v[[IN_HI]] ; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] -; GCN: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] +; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] ; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] ; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] @@ -601,18 +607,6 @@ ret void } -; GCN-LABEL: {{^}}fsub_f16: -; GCN: v_subrev_f32_e32 -; GCN: s_endpgm -define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 - %a = load half, half addrspace(1)* %in - %b = load half, half addrspace(1)* %b_ptr - %sub = fsub half %a, %b - store half %sub, half addrspace(1)* %out - ret void -} - ; GCN-LABEL: {{^}}test_bitcast_from_half: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -0,0 +1,155 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.fabs.f16(half %a) +declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) + +; GCN-LABEL: {{^}}simple_vt_class +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_I32:[0-9]+]] +; VI: v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]] +; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] +; GCN: buffer_store_dword v[[R_I32]] +; GCN: s_endpgm +define void @simple_vt_class( + i32 addrspace(1)* %r, + half addrspace(1)* %a, + i32 addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load i32, i32 addrspace(1)* %b + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 %b.val) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_class_fabs +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; VI: 
v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |v[[VA_F16]]|, s[[SB_I32]] +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] +; GCN: buffer_store_dword v[[VR_I32]] +; GCN: s_endpgm +define void @simple_vt_class_fabs( + i32 addrspace(1)* %r, + half %a.val, + i32 %b.val) { +entry: + %a.val.fabs = call half @llvm.fabs.f16(half %a.val) + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fabs, i32 %b.val) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_class_fneg +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[VA_F16]], s[[SB_I32]] +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] +; GCN: buffer_store_dword v[[VR_I32]] +; GCN: s_endpgm +define void @simple_vt_class_fneg( + i32 addrspace(1)* %r, + half %a.val, + i32 %b.val) { +entry: + %a.val.fneg = fsub half -0.0, %a.val + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fneg, i32 %b.val) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_class_fabs_fneg +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|v[[VA_F16]]|, s[[SB_I32]] +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] +; GCN: buffer_store_dword v[[VR_I32]] +; GCN: s_endpgm +define void @simple_vt_class_fabs_fneg( + i32 addrspace(1)* %r, + half %a.val, + i32 %b.val) { +entry: + %a.val.fabs = call half @llvm.fabs.f16(half %a.val) + %a.val.fabs.fneg = fsub half -0.0, %a.val.fabs + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fabs.fneg, i32 %b.val) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r 
+ ret void +} + +; GCN-LABEL: {{^}}simple_vt_class_1 +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[VA_F16]], 1{{$}} +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] +; GCN: buffer_store_dword v[[VR_I32]] +; GCN: s_endpgm +define void @simple_vt_class_1( + i32 addrspace(1)* %r, + half %a.val) { +entry: + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 1) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_class_64 +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[VA_F16]], 64{{$}} +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] +; GCN: buffer_store_dword v[[VR_I32]] +; GCN: s_endpgm +define void @simple_vt_class_64( + i32 addrspace(1)* %r, + half %a.val) { +entry: + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 64) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_class_full_mask +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x3ff{{$}} +; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e32 vcc, v[[VA_F16]], v[[MASK]] +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc +; GCN: buffer_store_dword v[[VR_I32]] +; GCN: s_endpgm +define void @simple_vt_class_full_mask( + i32 addrspace(1)* %r, + half %a.val) { +entry: + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 1023) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_class_nine_bit_mask +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x1ff{{$}} +; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e32 vcc, 
v[[VA_F16]], v[[MASK]] +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc +; GCN: buffer_store_dword v[[VR_I32]] +; GCN: s_endpgm +define void @simple_vt_class_nine_bit_mask( + i32 addrspace(1)* %r, + half %a.val) { +entry: + %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 511) + %r.val.sext = sext i1 %r.val to i32 + store i32 %r.val.sext, i32 addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.amdgcn.cos.f16(half %a) + +; GCN-LABEL: {{^}}simple_vt_cos +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_cos( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.cos.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -0,0 +1,129 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c) + +; GCN-LABEL: {{^}}simple_vt_div_fixup +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_div_fixup( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half 
addrspace(1)* %c) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_div_fixup_imm_a +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_div_fixup_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b, + half addrspace(1)* %c) { +entry: + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_div_fixup_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_div_fixup_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %c) { +entry: + %a.val = load half, half addrspace(1)* %a + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_div_fixup_imm_c +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void 
@simple_vt_div_fixup_imm_c( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_div_fixup_imm_a_imm_b +; VI: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}} +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_div_fixup_imm_a_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %c) { +entry: + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_div_fixup_imm_b_imm_c +; VI: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}} +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_div_fixup_imm_b_imm_c( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half 3.0) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_div_fixup_imm_a_imm_c +; VI: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}} +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_div_fixup_imm_a_imm_c( + half addrspace(1)* %r, + half addrspace(1)* %b) { +entry: + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, 
half 3.0) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.amdgcn.fract.f16(half %a) + +; GCN-LABEL: {{^}}simple_vt_fract +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_fract( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.fract.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.amdgcn.frexp.mant.f16(half %a) + +; GCN-LABEL: {{^}}simple_vt_frexp_mant +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_frexp_mant( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.frexp.mant.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
-check-prefix=VI %s + +declare half @llvm.amdgcn.rcp.f16(half %a) + +; GCN-LABEL: {{^}}simple_vt_rcp +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_rcp( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.rcp.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.amdgcn.rsq.f16(half %a) + +; GCN-LABEL: {{^}}simple_vt_rsq +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_rsq( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.amdgcn.rsq.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.amdgcn.sin.f16(half %a) + +; GCN-LABEL: {{^}}simple_vt_sin +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_sin( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half 
@llvm.amdgcn.sin.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.ceil.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.ceil.f16(half %a) +declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_ceil +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_ceil_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_ceil( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.ceil.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_ceil +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] 
+; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_ceil( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.ceil.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.cos.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.cos.f16(half %a) +declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_cos +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{1/2pi|0x3e22f983}}, v[[A_F32]] +; GCN: v_fract_f32_e32 v[[F_F32:[0-9]+]], v[[M_F32]] +; GCN: v_cos_f32_e32 v[[R_F32:[0-9]+]], v[[F_F32]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_cos( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.cos.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_cos +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} +; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] +; VI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 1/2pi, v[[A_F32_0]] +; GCN: v_fract_f32_e32 
v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] +; VI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 1/2pi, v[[A_F32_1]] +; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] +; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_cos( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.exp2.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.exp2.f16(half %a) +declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_exp +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_exp_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_exp( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val 
= call half @llvm.exp2.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_exp +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_exp_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_exp( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.exp2.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.floor.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.floor.f16(half %a) +declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_floor +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_floor_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], 
v[[R_F32]] +; VI: v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_floor( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.floor.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_floor +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_floor( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.floor.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -0,0 +1,235 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.fma.f16(half %a, half %b, half %c) 
+declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) + +; GCN-LABEL: {{^}}simple_vt_fma +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_fma( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) { + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_fma_imm_a +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_fma_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b, + half addrspace(1)* %c) { + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.fma.f16(half 3.0, half %b.val, half %c.val) + store half %r.val, half 
addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_fma_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_fma_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %c) { + %a.val = load half, half addrspace(1)* %a + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.fma.f16(half %a.val, half 3.0, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_fma_imm_c +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_fma_imm_c( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half 3.0) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_fma +; GCN: buffer_load_dword 
v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] +; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_fma( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b, + <2 x half> addrspace(1)* %c) { + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %c.val = load <2 x half>, <2 x half> addrspace(1)* %c + %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_fma_imm_a +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: 
v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} +; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]] +; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_fma_imm_a( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %b, + <2 x half> addrspace(1)* %c) { + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %c.val = load <2 x half>, <2 x half> addrspace(1)* %c + %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> <half 3.0, half 3.0>, <2 x half> %b.val, <2 x half> %c.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_fma_imm_b +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}} +; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +;
SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]] +; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_fma_imm_b( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %c) { + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %c.val = load <2 x half>, <2 x half> addrspace(1)* %c + %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> <half 3.0, half 3.0>, <2 x half> %c.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_fma_imm_c +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}} +; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]] +; SI: v_cvt_f16_f32_e32
v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] +; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_fma_imm_c( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> <half 3.0, half 3.0>) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -0,0 +1,116 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.fmuladd.f16(half %a, half %b, half %c) +declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) + +; GCN-LABEL: {{^}}simple_vt_fmuladd +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]],
v[[C_F32]] +; SI: buffer_store_short v[[R_F16]] +; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] +; VI: buffer_store_short v[[C_F16]] +; GCN: s_endpgm +define void @simple_vt_fmuladd( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) { + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.fmuladd.f16(half %a.val, half %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_fmuladd_imm_a +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] +; SI: buffer_store_short v[[R_F16]] +; VI: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]] +; VI: buffer_store_short v[[C_F16]] +; GCN: s_endpgm +define void @simple_vt_fmuladd_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b, + half addrspace(1)* %c) { + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_fmuladd_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] +; SI: buffer_store_short v[[R_F16]] +; VI: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] +; VI: buffer_store_short v[[C_F16]] +; GCN: 
s_endpgm +define void @simple_vt_fmuladd_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %c) { + %a.val = load half, half addrspace(1)* %a + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_fmuladd +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]] +; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] +; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] +; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]] +; VI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_fmuladd( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b, + <2 x half> addrspace(1)* %c) { + 
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %c.val = load <2 x half>, <2 x half> addrspace(1)* %c + %r.val = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.log2.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.log2.f16(half %a) +declare <2 x half> @llvm.log2.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_log +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_log_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_log( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.log2.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_log +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI: 
v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_log( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.log2.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -0,0 +1,153 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.maxnum.f16(half %a, half %b) +declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) + +; GCN-LABEL: {{^}}simple_vt_max +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_max( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_max_imm_a +; GCN: buffer_load_ushort 
v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_max_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b) { +entry: + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_max_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_max_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_max +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] 
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_max( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_max_imm_a +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_max_imm_a( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %b) { +entry: + 
%b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> , <2 x half> %b.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_max_imm_b +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]] +; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_max_imm_b( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> ) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -0,0 +1,153 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.minnum.f16(half 
%a, half %b) +declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) + +; GCN-LABEL: {{^}}simple_vt_min +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_min( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_min_imm_a +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_min_imm_a( + half addrspace(1)* %r, + half addrspace(1)* %b) { +entry: + %b.val = load half, half addrspace(1)* %b + %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_min_imm_b +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define 
void @simple_vt_min_imm_b( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_min +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_min( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_min_imm_a +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 
0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_min_imm_a( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %b) { +entry: + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> , <2 x half> %b.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_min_imm_b +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}} +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}} +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]] +; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: 
v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_min_imm_b( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> ) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.rint.f16(half %a) +declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_rndne +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_rndne( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.rint.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_rndne +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: 
v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_rndne( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.rint.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.sin.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.sin.f16(half %a) +declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_sin +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{1/2pi|0x3e22f983}}, v[[A_F32]] +; GCN: v_fract_f32_e32 v[[F_F32:[0-9]+]], v[[M_F32]] +; GCN: v_sin_f32_e32 v[[R_F32:[0-9]+]], v[[F_F32]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_sin( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.sin.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_sin +; GCN: 
buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} +; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] +; VI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 1/2pi, v[[A_F32_0]] +; GCN: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] +; VI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 1/2pi, v[[A_F32_1]] +; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; GCN: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] +; GCN: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_sin( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.sqrt.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.sqrt.f16(half %a) +declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_sqrt +; GCN: 
buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_sqrt_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_sqrt( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.sqrt.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_sqrt +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_sqrt_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_sqrt( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/llvm.trunc.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
-check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare half @llvm.trunc.f16(half %a) +declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) + +; GCN-LABEL: {{^}}simple_vt_trunc +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_trunc_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] +; VI: v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_trunc( + half addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %r.val = call half @llvm.trunc.f16(half %a.val) + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_trunc +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_trunc( + <2 x half> addrspace(1)* %r, + <2 x half> addrspace(1)* %a) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %r.val = call <2 x half> @llvm.trunc.v2f16(<2 x half> %a.val) + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: 
test/CodeGen/AMDGPU/sitofp.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sitofp.f16.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_convert_signed_short_to_half +; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]] +; SI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_f16_i16_e32 v[[R_F16:[0-9]+]], v[[A_I16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_convert_signed_short_to_half( + half addrspace(1)* %r, + i16 addrspace(1)* %a) { +entry: + %a.val = load i16, i16 addrspace(1)* %a + %r.val = sitofp i16 %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_signed_short_to_half +; GCN: buffer_load_dword v[[A_V2_I16:[0-9]+]] +; SI: v_bfe_i32 v[[A_I16_0:[0-9]+]], v[[A_V2_I16]], 0, 16 +; SI: v_ashrrev_i32_e32 v[[A_I16_1:[0-9]+]], 16, v[[A_V2_I16]] +; SI: v_cvt_f32_i32_e32 v[[A_F32_1:[0-9]+]], v[[A_I16_1]] +; SI: v_cvt_f32_i32_e32 v[[A_F32_0:[0-9]+]], v[[A_I16_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] +; VI: v_lshrrev_b32_e32 v[[A_I16_1:[0-9]+]], 16, v[[A_V2_I16]] +; VI: v_cvt_f16_i16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_I16]] +; VI: v_cvt_f16_i16_e32 v[[R_F16_1:[0-9]+]], v[[A_I16_1]] +; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: 
s_endpgm +define void @vector_vt_convert_signed_short_to_half( + <2 x half> addrspace(1)* %r, + <2 x i16> addrspace(1)* %a) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %r.val = sitofp <2 x i16> %a.val to <2 x half> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/uitofp.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uitofp.f16.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_convert_unsigned_short_to_half +; GCN: buffer_load_ushort v[[A_I16:[0-9]+]] +; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; VI: v_cvt_f16_u16_e32 v[[R_F16:[0-9]+]], v[[A_I16]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @simple_vt_convert_unsigned_short_to_half( + half addrspace(1)* %r, + i16 addrspace(1)* %a) { +entry: + %a.val = load i16, i16 addrspace(1)* %a + %r.val = uitofp i16 %a.val to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}vector_vt_convert_unsigned_short_to_half +; GCN: buffer_load_dword v[[A_V2_I16:[0-9]+]] +; SI: s_mov_b32 s[[MASK:[0-9]+]], 0xffff{{$}} +; SI: v_and_b32_e32 v[[A_I16_0:[0-9]+]], s[[MASK]], v[[A_V2_I16]] +; GCN: v_lshrrev_b32_e32 v[[A_I16_1:[0-9]+]], 16, v[[A_V2_I16]] +; SI: v_cvt_f32_u32_e32 v[[A_F32_1:[0-9]+]], v[[A_I16_1]] +; SI: v_cvt_f32_u32_e32 v[[A_F32_0:[0-9]+]], v[[A_I16_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] +; VI: v_cvt_f16_u16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_I16]] +; VI: v_cvt_f16_u16_e32 v[[R_F16_1:[0-9]+]], v[[A_I16_1]] +; VI: 
v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], s[[MASK]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: s_endpgm +define void @vector_vt_convert_unsigned_short_to_half( + <2 x half> addrspace(1)* %r, + <2 x i16> addrspace(1)* %a) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %r.val = uitofp <2 x i16> %a.val to <2 x half> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} Index: test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/v_mac_f16.ll @@ -0,0 +1,608 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}simple_vt_mac +; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] +; SI: buffer_store_short v[[R_F16]] +; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] +; VI: buffer_store_short v[[C_F16]] +; GCN: s_endpgm +define void @simple_vt_mac( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + + %t.val = fmul half %a.val, %b.val + %r.val = fadd half %t.val, %c.val + + store half %r.val, half 
addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_same_add +; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] +; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] +; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN: s_endpgm +define void @simple_vt_mac_same_add( + half addrspace(1)* %r0, + half addrspace(1)* %r1, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c, + half addrspace(1)* %d, + half addrspace(1)* %e) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %d.val = load half, half addrspace(1)* %d + %e.val = load half, half addrspace(1)* %e + + %t0.val = fmul half %a.val, %b.val + %r0.val = fadd half %t0.val, %c.val + + %t1.val = fmul half %d.val, %e.val + %r1.val = fadd half %t1.val, %c.val + + store half %r0.val, half addrspace(1)* %r0 + store half %r1.val, half addrspace(1)* %r1 + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_neg_a +; SI-NOT: v_mac_f32 +; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-NOT: v_mac_f16 +; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: s_endpgm +define void @simple_vt_mac_neg_a( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + + %a.neg = fsub half -0.0, %a.val + %t.val = fmul half %a.neg, %b.val + %r.val = fadd half %t.val, %c.val + + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_neg_b +; SI-NOT: v_mac_f32 +; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-NOT: v_mac_f16 +; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: s_endpgm +define void 
@simple_vt_mac_neg_b( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + + %b.neg = fsub half -0.0, %b.val + %t.val = fmul half %a.val, %b.neg + %r.val = fadd half %t.val, %c.val + + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_neg_c +; SI-NOT: v_mac_f32 +; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; VI-NOT: v_mac_f16 +; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GCN: s_endpgm +define void @simple_vt_mac_neg_c( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + + %c.neg = fsub half -0.0, %c.val + %t.val = fmul half %a.val, %b.val + %r.val = fadd half %t.val, %c.neg + + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_neg_a_safe_fp_math +; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} +; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] +; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] +; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] +; GCN: s_endpgm +define void @simple_vt_mac_neg_a_safe_fp_math( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + + %a.neg = fsub half 0.0, %a.val + %t.val = fmul half %a.neg, %b.val + %r.val = fadd half %t.val, %c.val + + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_neg_b_safe_fp_math +; SI: 
v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} +; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] +; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; GCN: s_endpgm +define void @simple_vt_mac_neg_b_safe_fp_math( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + + %b.neg = fsub half 0.0, %b.val + %t.val = fmul half %a.val, %b.neg + %r.val = fadd half %t.val, %c.val + + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_neg_c_safe_fp_math +; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} +; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] +; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN: s_endpgm +define void @simple_vt_mac_neg_c_safe_fp_math( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #0 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + + %c.neg = fsub half 0.0, %c.val + %t.val = fmul half %a.val, %b.val + %r.val = fadd half %t.val, %c.neg + + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}simple_vt_mac_neg_a_unsafe_fp_math +; SI-NOT: v_mac_f32 +; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} +; VI-NOT: v_mac_f16 +; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} +; GCN: s_endpgm +define void @simple_vt_mac_neg_a_unsafe_fp_math( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b, + half addrspace(1)* %c) #1 { +entry: + %a.val = 
load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %c.val = load half, half addrspace(1)* %c
+
+  %a.neg = fsub half 0.0, %a.val
+  %t.val = fmul half %a.neg, %b.val
+  %r.val = fadd half %t.val, %c.val
+
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}simple_vt_mac_neg_b_unsafe_fp_math
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @simple_vt_mac_neg_b_unsafe_fp_math(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b,
+    half addrspace(1)* %c) #1 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %c.val = load half, half addrspace(1)* %c
+
+  %b.neg = fsub half 0.0, %b.val
+  %t.val = fmul half %a.val, %b.neg
+  %r.val = fadd half %t.val, %c.val
+
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}simple_vt_mac_neg_c_unsafe_fp_math
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN: s_endpgm
+define void @simple_vt_mac_neg_c_unsafe_fp_math(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b,
+    half addrspace(1)* %c) #1 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %c.val = load half, half addrspace(1)* %c
+
+  %c.neg = fsub half 0.0, %c.val
+  %t.val = fmul half %a.val, %b.val
+  %r.val = fadd half %t.val, %c.neg
+
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac
+; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]]
+; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
+; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
+; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
+; VI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; GCN: buffer_store_dword v[[R_V2_F16]]
+; GCN: s_endpgm
+define void @vector_vt_mac(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %t.val = fmul <2 x half> %a.val, %b.val
+  %r.val = fadd <2 x half> %t.val, %c.val
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_same_add
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
+; SI: v_mac_f32_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mac_f32_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
+; VI: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_same_add(
+    <2 x half> addrspace(1)* %r0,
+    <2 x half> addrspace(1)* %r1,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c,
+    <2 x half> addrspace(1)* %d,
+    <2 x half> addrspace(1)* %e) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
+  %e.val = load <2 x half>, <2 x half> addrspace(1)* %e
+
+  %t0.val = fmul <2 x half> %a.val, %b.val
+  %r0.val = fadd <2 x half> %t0.val, %c.val
+
+  %t1.val = fmul <2 x half> %d.val, %e.val
+  %r1.val = fadd <2 x half> %t1.val, %c.val
+
+  store <2 x half> %r0.val, <2 x half> addrspace(1)* %r0
+  store <2 x half> %r1.val, <2 x half> addrspace(1)* %r1
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_a
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_a(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %a.neg = fsub <2 x half> <half -0.0, half -0.0>, %a.val ; -0.0 - x is an exact negation, expected to fold into a negated v_mad source
+  %t.val = fmul <2 x half> %a.neg, %b.val
+  %r.val = fadd <2 x half> %t.val, %c.val
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_b
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_b(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %b.neg = fsub <2 x half> <half -0.0, half -0.0>, %b.val ; -0.0 - x is an exact negation, expected to fold into a negated v_mad source
+  %t.val = fmul <2 x half> %a.val, %b.neg
+  %r.val = fadd <2 x half> %t.val, %c.val
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_c
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_c(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %c.neg = fsub <2 x half> <half -0.0, half -0.0>, %c.val ; -0.0 - x is an exact negation, expected to fold into a negated v_mad source
+  %t.val = fmul <2 x half> %a.val, %b.val
+  %r.val = fadd <2 x half> %t.val, %c.neg
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_a_safe_fp_math
+; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_a_safe_fp_math(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val ; 0.0 - x is not an exact negation, so a real subtract is expected to feed v_mac
+  %t.val = fmul <2 x half> %a.neg, %b.val
+  %r.val = fadd <2 x half> %t.val, %c.val
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_b_safe_fp_math
+; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_b_safe_fp_math(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val ; 0.0 - x is not an exact negation, so a real subtract is expected to feed v_mac
+  %t.val = fmul <2 x half> %a.val, %b.neg
+  %r.val = fadd <2 x half> %t.val, %c.val
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_c_safe_fp_math
+; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; SI: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_c_safe_fp_math(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #0 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val ; 0.0 - x is not an exact negation, so a real subtract is expected to feed v_mac
+  %t.val = fmul <2 x half> %a.val, %b.val
+  %r.val = fadd <2 x half> %t.val, %c.neg
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_a_unsafe_fp_math
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_a_unsafe_fp_math(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #1 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val ; with unsafe-fp-math (#1) 0.0 - x is expected to fold into a negated v_mad source
+  %t.val = fmul <2 x half> %a.neg, %b.val
+  %r.val = fadd <2 x half> %t.val, %c.val
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_b_unsafe_fp_math
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_b_unsafe_fp_math(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #1 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val ; with unsafe-fp-math (#1) 0.0 - x is expected to fold into a negated v_mad source
+  %t.val = fmul <2 x half> %a.val, %b.neg
+  %r.val = fadd <2 x half> %t.val, %c.val
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_vt_mac_neg_c_unsafe_fp_math
+; SI-NOT: v_mac_f32
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI-NOT: v_mac_f16
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN: s_endpgm
+define void @vector_vt_mac_neg_c_unsafe_fp_math(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    <2 x half> addrspace(1)* %c) #1 {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+
+  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val ; with unsafe-fp-math (#1) 0.0 - x is expected to fold into a negated v_mad source
+  %t.val = fmul <2 x half> %a.val, %b.val
+  %r.val = fadd <2 x half> %t.val, %c.neg
+
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+attributes #0 = {"unsafe-fp-math"="false"}
+attributes #1 = {"unsafe-fp-math"="true"}
Index: test/CodeGen/AMDGPU/v_madak_f16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}simple_vt_madak
+; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}}
+; VI: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define void @simple_vt_madak(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+
+  %t.val = fmul half %a.val, %b.val
+  %r.val = fadd half %t.val, 10.0
+
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}simple_vt_madak_use_2
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_endpgm
+define void @simple_vt_madak_use_2(
+    half addrspace(1)* %r0,
+    half addrspace(1)* %r1,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b,
+    half addrspace(1)* %c) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %c.val = load half, half addrspace(1)* %c
+
+  %t0.val = fmul half %a.val, %b.val
+  %t1.val = fmul half %a.val, %c.val
+  %r0.val = fadd half %t0.val, 10.0
+  %r1.val = fadd half %t1.val, 10.0
+
+  store half %r0.val, half addrspace(1)* %r0
+  store half %r1.val, half addrspace(1)* %r1
+  ret void
+}
Index: test/MC/Disassembler/AMDGPU/sdwa_vi.txt
===================================================================
--- test/MC/Disassembler/AMDGPU/sdwa_vi.txt
+++ test/MC/Disassembler/AMDGPU/sdwa_vi.txt
@@ -300,9 +300,6 @@
 
 # VI: v_mul_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x44,0x02,0x06,0x05,0x02]
 0xf9 0x06 0x02 0x44 0x02 0x06 0x05 0x02
 
-# VI: v_mac_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x46,0x02,0x06,0x05,0x02]
-0xf9 0x06 0x02 0x46 0x02 0x06 0x05 0x02
-
 # VI: v_add_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x4c,0x02,0x06,0x05,0x02]
 0xf9 0x06 0x02 0x4c 0x02 0x06 0x05 0x02