diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -122,8 +122,9 @@ VID_VL, // Matches the semantics of the vfcnvt.rod function (Convert double-width // float to single-width float, rounding towards odd). Takes a double-width - // float vector and produces a single-width float vector. - VFNCVT_ROD, + // float vector and produces a single-width float vector. Also has a mask and + // VL operand. + VFNCVT_ROD_VL, // These nodes match the semantics of the corresponding RVV vector reduction // instructions. They produce a vector result which is the reduction // performed over the first vector operand plus the first element of the @@ -175,6 +176,8 @@ UMAX_VL, MULHS_VL, MULHU_VL, + FP_ROUND_VL, + FP_EXTEND_VL, // Vector compare producing a mask. Fourth operand is input mask. Fifth // operand is VL. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -587,6 +587,10 @@ // By default everything must be expanded. for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) setOperationAction(Op, VT, Expand); + for (MVT OtherVT : MVT::fp_fixedlen_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand); + setTruncStoreAction(VT, OtherVT, Expand); + } // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed. setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); @@ -605,6 +609,9 @@ setOperationAction(ISD::FSQRT, VT, Custom); setOperationAction(ISD::FMA, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::FP_EXTEND, VT, Custom); + for (auto CC : VFPCCToExpand) setCondCodeAction(CC, VT, Expand); @@ -1080,6 +1087,21 @@ return SDValue(); } +static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT, + SDLoc DL, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (VT.isScalableVector()) + return DAG.getFPExtendOrRound(Op, DL, VT); + assert(VT.isFixedLengthVector() && + "Unexpected value type for RVV FP extend/round lowering"); + SDValue Mask, VL; + std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + unsigned RVVOpc = ContainerVT.bitsGT(Op.getSimpleValueType()) + ? RISCVISD::FP_EXTEND_VL + : RISCVISD::FP_ROUND_VL; + return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL); +} + SDValue RISCVTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -1253,33 +1275,86 @@ // RVV can only do fp_extend to types double the size as the source. We // custom-lower f16->f64 extensions to two hops of ISD::FP_EXTEND, going // via f32. + SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - MVT SrcVT = Op.getOperand(0).getSimpleValueType(); - // We only need to close the gap between vXf16->vXf64. + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + // Prepare any fixed-length vector operands. 
+ MVT ContainerVT = VT; + if (SrcVT.isFixedLengthVector()) { + ContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector( + DAG, VT, Subtarget); + MVT SrcContainerVT = + ContainerVT.changeVectorElementType(SrcVT.getVectorElementType()); + Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget); + } + if (!VT.isVector() || VT.getVectorElementType() != MVT::f64 || - SrcVT.getVectorElementType() != MVT::f16) - return Op; - SDLoc DL(Op); - MVT InterVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); - SDValue IntermediateRound = - DAG.getFPExtendOrRound(Op.getOperand(0), DL, InterVT); - return DAG.getFPExtendOrRound(IntermediateRound, DL, VT); + SrcVT.getVectorElementType() != MVT::f16) { + // For scalable vectors, we only need to close the gap between + // vXf16->vXf64. + if (!VT.isFixedLengthVector()) + return Op; + // For fixed-length vectors, lower the FP_EXTEND to a custom "VL" version. + Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget); + return convertFromScalableVector(VT, Src, DAG, Subtarget); + } + + MVT InterVT = VT.changeVectorElementType(MVT::f32); + MVT InterContainerVT = ContainerVT.changeVectorElementType(MVT::f32); + SDValue IntermediateExtend = getRVVFPExtendOrRound( + Src, InterVT, InterContainerVT, DL, DAG, Subtarget); + + SDValue Extend = getRVVFPExtendOrRound(IntermediateExtend, VT, ContainerVT, + DL, DAG, Subtarget); + if (VT.isFixedLengthVector()) + return convertFromScalableVector(VT, Extend, DAG, Subtarget); + return Extend; } case ISD::FP_ROUND: { // RVV can only do fp_round to types half the size as the source. We // custom-lower f64->f16 rounds via RVV's round-to-odd float // conversion instruction. + SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - MVT SrcVT = Op.getOperand(0).getSimpleValueType(); - // We only need to close the gap between vXf64<->vXf16. + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + // Prepare any fixed-length vector operands. + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + MVT SrcContainerVT = + RISCVTargetLowering::getContainerForFixedLengthVector(DAG, SrcVT, + Subtarget); + ContainerVT = + SrcContainerVT.changeVectorElementType(VT.getVectorElementType()); + Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget); + } + if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 || - SrcVT.getVectorElementType() != MVT::f64) - return Op; - SDLoc DL(Op); - MVT InterVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + SrcVT.getVectorElementType() != MVT::f64) { + // For scalable vectors, we only need to close the gap between + // vXf64<->vXf16. + if (!VT.isFixedLengthVector()) + return Op; + // For fixed-length vectors, lower the FP_ROUND to a custom "VL" version. 
+ Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget); + return convertFromScalableVector(VT, Src, DAG, Subtarget); + } + + SDValue Mask, VL; + std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + + MVT InterVT = ContainerVT.changeVectorElementType(MVT::f32); SDValue IntermediateRound = - DAG.getNode(RISCVISD::VFNCVT_ROD, DL, InterVT, Op.getOperand(0)); - return DAG.getFPExtendOrRound(IntermediateRound, DL, VT); + DAG.getNode(RISCVISD::VFNCVT_ROD_VL, DL, InterVT, Src, Mask, VL); + SDValue Round = getRVVFPExtendOrRound(IntermediateRound, VT, ContainerVT, + DL, DAG, Subtarget); + + if (VT.isFixedLengthVector()) + return convertFromScalableVector(VT, Round, DAG, Subtarget); + return Round; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: @@ -5413,7 +5488,7 @@ NODE_NAME_CASE(VSLIDEUP_VL) NODE_NAME_CASE(VSLIDEDOWN_VL) NODE_NAME_CASE(VID_VL) - NODE_NAME_CASE(VFNCVT_ROD) + NODE_NAME_CASE(VFNCVT_ROD_VL) NODE_NAME_CASE(VECREDUCE_ADD) NODE_NAME_CASE(VECREDUCE_UMAX) NODE_NAME_CASE(VECREDUCE_SMAX) @@ -5451,6 +5526,8 @@ NODE_NAME_CASE(UMAX_VL) NODE_NAME_CASE(MULHS_VL) NODE_NAME_CASE(MULHU_VL) + NODE_NAME_CASE(FP_ROUND_VL) + NODE_NAME_CASE(FP_EXTEND_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) NODE_NAME_CASE(VMAND_VL) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -538,10 +538,6 @@ } // Predicates = [HasStdExtV] -def riscv_fncvt_rod - : SDNode<"RISCVISD::VFNCVT_ROD", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>, []>; - let Predicates = [HasStdExtV, HasStdExtF] in { // 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions @@ -719,12 +715,7 @@ def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX) fwti.RegClass:$rs1, fvti.AVL, fvti.SEW)>; - - def : Pat<(fvti.Vector (riscv_fncvt_rod (fwti.Vector fwti.RegClass:$rs1))), - (!cast("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX) - fwti.RegClass:$rs1, fvti.AVL, fvti.SEW)>; } - } // Predicates = [HasStdExtV, HasStdExtF] //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -96,6 +96,20 @@ SDTCisVT<5, XLenVT>]>; def riscv_fma_vl : SDNode<"RISCVISD::FMA_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>; +def SDT_RISCVFPRoundOp_VL : SDTypeProfile<1, 3, [ + SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>, + SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT> +]>; +def SDT_RISCVFPExtendOp_VL : SDTypeProfile<1, 3, [ + SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>, + SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT> +]>; + +def riscv_fpround_vl : SDNode<"RISCVISD::FP_ROUND_VL", SDT_RISCVFPRoundOp_VL>; +def riscv_fpextend_vl : SDNode<"RISCVISD::FP_EXTEND_VL", SDT_RISCVFPExtendOp_VL>; +def riscv_fncvt_rod_vl : SDNode<"RISCVISD::VFNCVT_ROD_VL", SDT_RISCVFPRoundOp_VL>; + + def riscv_setcc_vl : SDNode<"RISCVISD::SETCC_VL", SDTypeProfile<1, 5, [SDTCVecEltisVT<0, i1>, SDTCisVec<1>, @@ -740,6 +754,33 @@ fvti.LMul.MX) (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl, fvti.SEW)>; + + // 14.18. 
Widening Floating-Point/Integer Type-Convert Instructions + foreach fvtiToFWti = AllWidenableFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + def : Pat<(fwti.Vector (riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1), + (fvti.Mask true_mask), + (XLenVT (VLOp GPR:$vl)))), + (!cast("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX) + fvti.RegClass:$rs1, GPR:$vl, fvti.SEW)>; + } + + foreach fvtiToFWti = AllWidenableFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + def : Pat<(fvti.Vector (riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1), + (fwti.Mask true_mask), + (XLenVT (VLOp GPR:$vl)))), + (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX) + fwti.RegClass:$rs1, GPR:$vl, fvti.SEW)>; + + def : Pat<(fvti.Vector (riscv_fncvt_rod_vl (fwti.Vector fwti.RegClass:$rs1), + (fwti.Mask true_mask), + (XLenVT (VLOp GPR:$vl)))), + (!cast("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX) + fwti.RegClass:$rs1, GPR:$vl, fvti.SEW)>; + } } } // Predicates = [HasStdExtV, HasStdExtF] diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -0,0 +1,285 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1 + +define void @fpext_v2f16_v2f32(<2 x half>* %x, <2 x float>* %y) { +; CHECK-LABEL: fpext_v2f16_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vsetvli a0, a2, e16,mf2,ta,mu +; CHECK-NEXT: vfwcvt.f.f.v v26, v25 +; CHECK-NEXT: vsetvli a0, a2, e32,m1,ta,mu +; CHECK-NEXT: vse32.v v26, (a1) +; CHECK-NEXT: ret + %a = load <2 x half>, <2 x half>* %x + %d = fpext <2 x half> %a to <2 x float> + store <2 x float> %d, <2 x float>* %y + ret void +} + +define void @fpext_v2f16_v2f64(<2 x half>* %x, <2 x double>* %y) { +; CHECK-LABEL: fpext_v2f16_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vsetvli a0, a2, e16,mf4,ta,mu +; CHECK-NEXT: vfwcvt.f.f.v v26, v25 +; CHECK-NEXT: vsetvli a0, a2, e32,mf2,ta,mu +; CHECK-NEXT: vfwcvt.f.f.v v25, v26 +; CHECK-NEXT: vsetvli a0, a2, e64,m1,ta,mu +; CHECK-NEXT: vse64.v v25, (a1) +; CHECK-NEXT: ret + %a = load <2 x half>, <2 x half>* %x + %d = fpext <2 x half> %a to <2 x double> + store <2 x double> %d, <2 x double>* %y + ret void +} + +define void 
@fpext_v8f16_v8f32(<8 x half>* %x, <8 x float>* %y) { +; LMULMAX8-LABEL: fpext_v8f16_v8f32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 8 +; LMULMAX8-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; LMULMAX8-NEXT: vle16.v v25, (a0) +; LMULMAX8-NEXT: vfwcvt.f.f.v v26, v25 +; LMULMAX8-NEXT: vsetvli a0, a2, e32,m2,ta,mu +; LMULMAX8-NEXT: vse32.v v26, (a1) +; LMULMAX8-NEXT: ret +; +; LMULMAX1-LABEL: fpext_v8f16_v8f32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a2, zero, 8 +; LMULMAX1-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vle16.v v25, (a0) +; LMULMAX1-NEXT: addi a0, zero, 4 +; LMULMAX1-NEXT: vsetvli a3, a0, e16,mf2,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v26, v25 +; LMULMAX1-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vslidedown.vi v25, v25, 4 +; LMULMAX1-NEXT: vsetvli a2, a0, e16,mf2,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v27, v25 +; LMULMAX1-NEXT: addi a2, a1, 16 +; LMULMAX1-NEXT: vsetvli a0, a0, e32,m1,ta,mu +; LMULMAX1-NEXT: vse32.v v27, (a2) +; LMULMAX1-NEXT: vse32.v v26, (a1) +; LMULMAX1-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %d = fpext <8 x half> %a to <8 x float> + store <8 x float> %d, <8 x float>* %y + ret void +} + +define void @fpext_v8f16_v8f64(<8 x half>* %x, <8 x double>* %y) { +; LMULMAX8-LABEL: fpext_v8f16_v8f64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 8 +; LMULMAX8-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; LMULMAX8-NEXT: vle16.v v25, (a0) +; LMULMAX8-NEXT: vfwcvt.f.f.v v26, v25 +; LMULMAX8-NEXT: vsetvli a0, a2, e32,m2,ta,mu +; LMULMAX8-NEXT: vfwcvt.f.f.v v28, v26 +; LMULMAX8-NEXT: vsetvli a0, a2, e64,m4,ta,mu +; LMULMAX8-NEXT: vse64.v v28, (a1) +; LMULMAX8-NEXT: ret +; +; LMULMAX1-LABEL: fpext_v8f16_v8f64: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a2, zero, 8 +; LMULMAX1-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vle16.v v25, (a0) +; LMULMAX1-NEXT: addi a0, zero, 4 +; LMULMAX1-NEXT: vsetvli a3, a0, e16,m1,ta,mu +; LMULMAX1-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-NEXT: addi a3, zero, 2 +; LMULMAX1-NEXT: vsetvli a4, a3, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v27, v26 +; LMULMAX1-NEXT: vsetvli a4, a3, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v26, v27 +; LMULMAX1-NEXT: vsetvli a4, a3, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v27, v25 +; LMULMAX1-NEXT: vsetvli a4, a3, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v28, v27 +; LMULMAX1-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vslidedown.vi v25, v25, 4 +; LMULMAX1-NEXT: vsetvli a2, a3, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v27, v25 +; LMULMAX1-NEXT: vsetvli a2, a3, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v29, v27 +; LMULMAX1-NEXT: vsetvli a0, a0, e16,m1,ta,mu +; LMULMAX1-NEXT: vslidedown.vi v25, v25, 2 +; LMULMAX1-NEXT: vsetvli a0, a3, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v27, v25 +; LMULMAX1-NEXT: vsetvli a0, a3, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfwcvt.f.f.v v25, v27 +; LMULMAX1-NEXT: addi a0, a1, 48 +; LMULMAX1-NEXT: vsetvli a2, a3, e64,m1,ta,mu +; LMULMAX1-NEXT: vse64.v v25, (a0) +; LMULMAX1-NEXT: addi a0, a1, 32 +; LMULMAX1-NEXT: vse64.v v29, (a0) +; LMULMAX1-NEXT: vse64.v v28, (a1) +; LMULMAX1-NEXT: addi a0, a1, 16 +; LMULMAX1-NEXT: vse64.v v26, (a0) +; LMULMAX1-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %d = fpext <8 x half> %a to <8 x double> + store <8 x double> %d, <8 x double>* %y + ret void +} + +define void @fpround_v2f32_v2f16(<2 x float>* %x, <2 x half>* %y) { +; CHECK-LABEL: fpround_v2f32_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a3, a2, e32,m1,ta,mu +; CHECK-NEXT: 
vle32.v v25, (a0) +; CHECK-NEXT: vsetvli a0, a2, e16,mf2,ta,mu +; CHECK-NEXT: vfncvt.f.f.w v26, v25 +; CHECK-NEXT: vsetvli a0, a2, e16,m1,ta,mu +; CHECK-NEXT: vse16.v v26, (a1) +; CHECK-NEXT: ret + %a = load <2 x float>, <2 x float>* %x + %d = fptrunc <2 x float> %a to <2 x half> + store <2 x half> %d, <2 x half>* %y + ret void +} + +define void @fpround_v2f64_v2f16(<2 x double>* %x, <2 x half>* %y) { +; CHECK-LABEL: fpround_v2f64_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a3, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vsetvli a0, a2, e32,mf2,ta,mu +; CHECK-NEXT: vfncvt.rod.f.f.w v26, v25 +; CHECK-NEXT: vsetvli a0, a2, e16,mf4,ta,mu +; CHECK-NEXT: vfncvt.f.f.w v25, v26 +; CHECK-NEXT: vsetvli a0, a2, e16,m1,ta,mu +; CHECK-NEXT: vse16.v v25, (a1) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %d = fptrunc <2 x double> %a to <2 x half> + store <2 x half> %d, <2 x half>* %y + ret void +} + +define void @fpround_v8f32_v8f16(<8 x float>* %x, <8 x half>* %y) { +; LMULMAX8-LABEL: fpround_v8f32_v8f16: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 8 +; LMULMAX8-NEXT: vsetvli a3, a2, e32,m2,ta,mu +; LMULMAX8-NEXT: vle32.v v26, (a0) +; LMULMAX8-NEXT: vsetvli a0, a2, e16,m1,ta,mu +; LMULMAX8-NEXT: vfncvt.f.f.w v25, v26 +; LMULMAX8-NEXT: vse16.v v25, (a1) +; LMULMAX8-NEXT: ret +; +; LMULMAX1-LABEL: fpround_v8f32_v8f16: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -16 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-NEXT: addi a2, zero, 4 +; LMULMAX1-NEXT: vsetvli a3, a2, e32,m1,ta,mu +; LMULMAX1-NEXT: addi a3, a0, 16 +; LMULMAX1-NEXT: vle32.v v25, (a3) +; LMULMAX1-NEXT: vle32.v v26, (a0) +; LMULMAX1-NEXT: vsetvli a0, a2, e16,mf2,ta,mu +; LMULMAX1-NEXT: vfncvt.f.f.w v27, v25 +; LMULMAX1-NEXT: addi a0, sp, 8 +; LMULMAX1-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vse16.v v27, (a0) +; LMULMAX1-NEXT: vsetvli a0, a2, e16,mf2,ta,mu +; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 +; LMULMAX1-NEXT: vsetvli a0, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vse16.v v25, (sp) +; LMULMAX1-NEXT: addi a0, zero, 8 +; LMULMAX1-NEXT: vsetvli a0, a0, e16,m1,ta,mu +; LMULMAX1-NEXT: vle16.v v25, (sp) +; LMULMAX1-NEXT: vse16.v v25, (a1) +; LMULMAX1-NEXT: addi sp, sp, 16 +; LMULMAX1-NEXT: ret + %a = load <8 x float>, <8 x float>* %x + %d = fptrunc <8 x float> %a to <8 x half> + store <8 x half> %d, <8 x half>* %y + ret void +} + +define void @fpround_v8f64_v8f16(<8 x double>* %x, <8 x half>* %y) { +; LMULMAX8-LABEL: fpround_v8f64_v8f16: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 8 +; LMULMAX8-NEXT: vsetvli a3, a2, e64,m4,ta,mu +; LMULMAX8-NEXT: vle64.v v28, (a0) +; LMULMAX8-NEXT: vsetvli a0, a2, e32,m2,ta,mu +; LMULMAX8-NEXT: vfncvt.rod.f.f.w v26, v28 +; LMULMAX8-NEXT: vsetvli a0, a2, e16,m1,ta,mu +; LMULMAX8-NEXT: vfncvt.f.f.w v25, v26 +; LMULMAX8-NEXT: vse16.v v25, (a1) +; LMULMAX8-NEXT: ret +; +; LMULMAX1-LABEL: fpround_v8f64_v8f16: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-NEXT: addi a2, zero, 2 +; LMULMAX1-NEXT: vsetvli a3, a2, e64,m1,ta,mu +; LMULMAX1-NEXT: vle64.v v25, (a0) +; LMULMAX1-NEXT: addi a3, a0, 32 +; LMULMAX1-NEXT: vle64.v v26, (a3) +; LMULMAX1-NEXT: addi a3, a0, 48 +; LMULMAX1-NEXT: vle64.v v27, (a3) +; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: vle64.v v28, (a0) +; LMULMAX1-NEXT: vsetvli a0, a2, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfncvt.rod.f.f.w v29, v27 +; LMULMAX1-NEXT: vsetvli a0, a2, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfncvt.f.f.w v27, 
v29 +; LMULMAX1-NEXT: addi a0, sp, 12 +; LMULMAX1-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vse16.v v27, (a0) +; LMULMAX1-NEXT: vsetvli a0, a2, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfncvt.rod.f.f.w v27, v28 +; LMULMAX1-NEXT: vsetvli a0, a2, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 +; LMULMAX1-NEXT: addi a0, sp, 4 +; LMULMAX1-NEXT: vsetvli a3, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vse16.v v28, (a0) +; LMULMAX1-NEXT: vsetvli a0, a2, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfncvt.rod.f.f.w v27, v26 +; LMULMAX1-NEXT: vsetvli a0, a2, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 +; LMULMAX1-NEXT: vsetvli a0, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: addi a0, sp, 8 +; LMULMAX1-NEXT: vse16.v v26, (a0) +; LMULMAX1-NEXT: addi a0, zero, 4 +; LMULMAX1-NEXT: vsetvli a3, a0, e16,m1,ta,mu +; LMULMAX1-NEXT: addi a3, sp, 8 +; LMULMAX1-NEXT: vle16.v v26, (a3) +; LMULMAX1-NEXT: addi a3, sp, 24 +; LMULMAX1-NEXT: vse16.v v26, (a3) +; LMULMAX1-NEXT: vsetvli a3, a2, e32,mf2,ta,mu +; LMULMAX1-NEXT: vfncvt.rod.f.f.w v26, v25 +; LMULMAX1-NEXT: vsetvli a3, a2, e16,mf4,ta,mu +; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 +; LMULMAX1-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-NEXT: vse16.v v25, (sp) +; LMULMAX1-NEXT: vsetvli a0, a0, e16,m1,ta,mu +; LMULMAX1-NEXT: vle16.v v25, (sp) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vse16.v v25, (a0) +; LMULMAX1-NEXT: addi a0, zero, 8 +; LMULMAX1-NEXT: vsetvli a0, a0, e16,m1,ta,mu +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vle16.v v25, (a0) +; LMULMAX1-NEXT: vse16.v v25, (a1) +; LMULMAX1-NEXT: addi sp, sp, 32 +; LMULMAX1-NEXT: ret + %a = load <8 x double>, <8 x double>* %x + %d = fptrunc <8 x double> %a to <8 x half> + store <8 x half> %d, <8 x half>* %y + ret void +}
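
The new test file above only covers fixed-length vectors. For comparison, the scalable-vector case of the vXf16->vXf64 "gap closing" described in the FP_EXTEND comments is lowered by getRVVFPExtendOrRound as two plain ISD::FP_EXTEND hops via f32 rather than through the new FP_EXTEND_VL node. The sketch below is illustrative only: it is hand-written (the function name is made up for this example), is not part of the patch, and carries no autogenerated CHECK lines.

; Illustrative only: a scalable f16->f64 extension. getRVVFPExtendOrRound
; takes its isScalableVector() branch and emits FP_EXTEND nxv1f16->nxv1f32
; followed by FP_EXTEND nxv1f32->nxv1f64, which the existing fpextend
; patterns in RISCVInstrInfoVSDPatterns.td select as two vfwcvt.f.f.v
; instructions.
define <vscale x 1 x double> @fpext_nxv1f16_nxv1f64(<vscale x 1 x half> %a) {
  %b = fpext <vscale x 1 x half> %a to <vscale x 1 x double>
  ret <vscale x 1 x double> %b
}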