diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -245,7 +245,14 @@
   // Widening instructions
   VWMUL_VL,
   VWMULU_VL,
+  VWADD_VL,
   VWADDU_VL,
+  VWSUB_VL,
+  VWSUBU_VL,
+  VWADD_W_VL,
+  VWADDU_W_VL,
+  VWSUB_W_VL,
+  VWSUBU_W_VL,

   // Vector compare producing a mask. Fourth operand is input mask. Fifth
   // operand is VL.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7240,6 +7240,129 @@
   return SDValue(N, 0);
 }

+// Try to form vwadd(u).wv/wx or vwsub(u).wv/wx. It might later be optimized to
+// vwadd(u).vv/vx or vwsub(u).vv/vx.
+static SDValue combineADDSUB_VLToVWADDSUB_VL(SDNode *N, SelectionDAG &DAG,
+                                             bool Commute = false) {
+  assert((N->getOpcode() == RISCVISD::ADD_VL ||
+          N->getOpcode() == RISCVISD::SUB_VL) && "Unexpected opcode");
+  bool IsAdd = N->getOpcode() == RISCVISD::ADD_VL;
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Commute)
+    std::swap(Op0, Op1);
+
+  MVT VT = N->getSimpleValueType(0);
+
+  // Determine the narrow size for the widening add/sub.
+  unsigned NarrowSize = VT.getScalarSizeInBits() / 2;
+  MVT NarrowVT = MVT::getVectorVT(MVT::getIntegerVT(NarrowSize),
+                                  VT.getVectorElementCount());
+
+  SDValue Mask = N->getOperand(2);
+  SDValue VL = N->getOperand(3);
+
+  SDLoc DL(N);
+
+  // If the RHS is a sext or zext, we can form a widening op.
+  if ((Op1.getOpcode() == RISCVISD::VZEXT_VL ||
+       Op1.getOpcode() == RISCVISD::VSEXT_VL) && Op1.hasOneUse() &&
+      Op1.getOperand(1) == Mask && Op1.getOperand(2) == VL) {
+    unsigned ExtOpc = Op1.getOpcode();
+    Op1 = Op1.getOperand(0);
+    // Re-introduce narrower extends if needed.
+    if (Op1.getValueType() != NarrowVT)
+      Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
+
+    unsigned WOpc;
+    if (ExtOpc == RISCVISD::VSEXT_VL)
+      WOpc = IsAdd ? RISCVISD::VWADD_W_VL : RISCVISD::VWSUB_W_VL;
+    else
+      WOpc = IsAdd ? RISCVISD::VWADDU_W_VL : RISCVISD::VWSUBU_W_VL;
+
+    return DAG.getNode(WOpc, DL, VT, Op0, Op1, Mask, VL);
+  }
+
+  // FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar
+  // sext/zext?
+
+  return SDValue();
+}
+
+// Try to convert vwadd(u).wv/wx or vwsub(u).wv/wx to vwadd(u).vv/vx or
+// vwsub(u).vv/vx.
+static SDValue combineVWADD_W_VL_VWSUB_W_VL(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Mask = N->getOperand(2);
+  SDValue VL = N->getOperand(3);
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT NarrowVT = Op1.getSimpleValueType();
+  unsigned NarrowSize = NarrowVT.getScalarSizeInBits();
+
+  unsigned VOpc;
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("Unexpected opcode");
+  case RISCVISD::VWADD_W_VL:  VOpc = RISCVISD::VWADD_VL;  break;
+  case RISCVISD::VWSUB_W_VL:  VOpc = RISCVISD::VWSUB_VL;  break;
+  case RISCVISD::VWADDU_W_VL: VOpc = RISCVISD::VWADDU_VL; break;
+  case RISCVISD::VWSUBU_W_VL: VOpc = RISCVISD::VWSUBU_VL; break;
+  }
+
+  bool IsSigned = N->getOpcode() == RISCVISD::VWADD_W_VL ||
+                  N->getOpcode() == RISCVISD::VWSUB_W_VL;
+
+  SDLoc DL(N);
+
+  // If the LHS is a sext or zext, we can narrow this op to the same size as
+  // the RHS.
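+  // (Illustrative example, not part of the original patch: a vwadd.wv whose
+  // wide operand is itself sign-extended from the narrow element type can be
+  // rewritten as a vwadd.vv of the two narrow operands, removing the extend.)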
+  if (((Op0.getOpcode() == RISCVISD::VZEXT_VL && !IsSigned) ||
+       (Op0.getOpcode() == RISCVISD::VSEXT_VL && IsSigned)) && Op0.hasOneUse() &&
+      Op0.getOperand(1) == Mask && Op0.getOperand(2) == VL) {
+    unsigned ExtOpc = Op0.getOpcode();
+    Op0 = Op0.getOperand(0);
+    // Re-introduce narrower extends if needed.
+    if (Op0.getValueType() != NarrowVT)
+      Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
+    return DAG.getNode(VOpc, DL, VT, Op0, Op1, Mask, VL);
+  }
+
+  bool IsAdd = N->getOpcode() == RISCVISD::VWADD_W_VL ||
+               N->getOpcode() == RISCVISD::VWADDU_W_VL;
+
+  // Look for splats on the left hand side of a vwadd(u).wv. We might be able
+  // to commute and use a vwadd(u).vx instead.
+  if (IsAdd && Op0.getOpcode() == RISCVISD::VMV_V_X_VL &&
+      Op0.getOperand(1) == VL) {
+    Op0 = Op0.getOperand(0);
+
+    // See if we have enough sign bits or zero bits in the scalar to use a
+    // widening add/sub by splatting to the smaller element size.
+    unsigned EltBits = VT.getScalarSizeInBits();
+    unsigned ScalarBits = Op0.getValueSizeInBits();
+    // Make sure we're getting all element bits from the scalar register.
+    // FIXME: Support implicit sign extension of vmv.v.x?
+    if (ScalarBits < EltBits)
+      return SDValue();
+
+    if (IsSigned) {
+      if (DAG.ComputeNumSignBits(Op0) <= (ScalarBits - NarrowSize))
+        return SDValue();
+    } else {
+      APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
+      if (!DAG.MaskedValueIsZero(Op0, Mask))
+        return SDValue();
+    }
+
+    Op0 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op0, VL);
+    return DAG.getNode(VOpc, DL, VT, Op1, Op0, Mask, VL);
+  }
+
+  return SDValue();
+}
+
 // Try to form VWMUL or VWMULU.
 // FIXME: Support VWMULSU.
 static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
@@ -7892,6 +8015,17 @@
     }
     break;
   }
+  case RISCVISD::ADD_VL:
+    if (SDValue V = combineADDSUB_VLToVWADDSUB_VL(N, DAG, /*Commute*/ false))
+      return V;
+    return combineADDSUB_VLToVWADDSUB_VL(N, DAG, /*Commute*/ true);
+  case RISCVISD::SUB_VL:
+    return combineADDSUB_VLToVWADDSUB_VL(N, DAG);
+  case RISCVISD::VWADD_W_VL:
+  case RISCVISD::VWADDU_W_VL:
+  case RISCVISD::VWSUB_W_VL:
+  case RISCVISD::VWSUBU_W_VL:
+    return combineVWADD_W_VL_VWSUB_W_VL(N, DAG);
   case RISCVISD::MUL_VL:
     if (SDValue V = combineMUL_VLToVWMUL_VL(N, DAG, /*Commute*/ false))
      return V;
@@ -10081,7 +10215,14 @@
   NODE_NAME_CASE(FP_ROUND_VL)
   NODE_NAME_CASE(VWMUL_VL)
   NODE_NAME_CASE(VWMULU_VL)
+  NODE_NAME_CASE(VWADD_VL)
   NODE_NAME_CASE(VWADDU_VL)
+  NODE_NAME_CASE(VWSUB_VL)
+  NODE_NAME_CASE(VWSUBU_VL)
+  NODE_NAME_CASE(VWADD_W_VL)
+  NODE_NAME_CASE(VWADDU_W_VL)
+  NODE_NAME_CASE(VWSUB_W_VL)
+  NODE_NAME_CASE(VWSUBU_W_VL)
   NODE_NAME_CASE(SETCC_VL)
   NODE_NAME_CASE(VSELECT_VL)
   NODE_NAME_CASE(VMAND_VL)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -229,7 +229,22 @@
                                           SDTCisVT<4, XLenVT>]>;
 def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
 def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
+def riscv_vwadd_vl : SDNode<"RISCVISD::VWADD_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
 def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
+def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
+def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
+
+def SDT_RISCVVWBinOpW_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
+                                                SDTCisSameAs<0, 1>,
+                                                SDTCisSameNumEltsAs<1, 2>,
+                                                SDTCisOpSmallerThanOp<2, 1>,
+                                                SDTCisSameNumEltsAs<1, 3>,
+                                                SDTCVecEltisVT<3, i1>,
+                                                SDTCisVT<4, XLenVT>]>;
+def riscv_vwadd_w_vl : SDNode<"RISCVISD::VWADD_W_VL", SDT_RISCVVWBinOpW_VL>;
+def riscv_vwaddu_w_vl : SDNode<"RISCVISD::VWADDU_W_VL", SDT_RISCVVWBinOpW_VL>;
+def riscv_vwsub_w_vl : SDNode<"RISCVISD::VWSUB_W_VL", SDT_RISCVVWBinOpW_VL>;
+def riscv_vwsubu_w_vl : SDNode<"RISCVISD::VWSUBU_W_VL", SDT_RISCVVWBinOpW_VL>;

 def SDTRVVVecReduce : SDTypeProfile<1, 5, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>,
@@ -267,32 +282,35 @@
 def sew32simm5 : ComplexPattern", []>;
 def sew64simm5 : ComplexPattern", []>;

-multiclass VPatBinaryVL_VV {
+multiclass VPatBinaryVL_V {
   def : Pat<(result_type (vop
-                          (op_type op_reg_class:$rs1),
-                          (op_type op_reg_class:$rs2),
+                          (op1_type op1_reg_class:$rs1),
+                          (op2_type op2_reg_class:$rs2),
                           (mask_type true_mask),
                           VLOpFrag)),
-            (!cast(instruction_name#"_VV_"# vlmul.MX)
-                 op_reg_class:$rs1,
-                 op_reg_class:$rs2,
+            (!cast(instruction_name#"_"#suffix#"_"# vlmul.MX)
+                 op1_reg_class:$rs1,
+                 op2_reg_class:$rs2,
                  GPR:$vl, sew)>;
   def : Pat<(result_type (vop
-                          (op_type op_reg_class:$rs1),
-                          (op_type op_reg_class:$rs2),
+                          (op1_type op1_reg_class:$rs1),
+                          (op2_type op2_reg_class:$rs2),
                           (mask_type V0),
                           VLOpFrag)),
-            (!cast(instruction_name#"_VV_"# vlmul.MX#"_MASK")
+            (!cast(instruction_name#"_"#suffix#"_"# vlmul.MX#"_MASK")
                  (result_type (IMPLICIT_DEF)),
-                 op_reg_class:$rs1,
-                 op_reg_class:$rs2,
+                 op1_reg_class:$rs1,
+                 op2_reg_class:$rs2,
                  (mask_type V0), GPR:$vl, sew, TAIL_AGNOSTIC)>;
 }
@@ -300,7 +318,8 @@
                      string instruction_name,
                      string suffix,
                      ValueType result_type,
-                     ValueType vop_type,
+                     ValueType vop1_type,
+                     ValueType vop2_type,
                      ValueType mask_type,
                      int sew,
                      LMULInfo vlmul,
@@ -308,8 +327,8 @@
                      ComplexPattern SplatPatKind,
                      DAGOperand xop_kind> {
   def : Pat<(result_type (vop
-                 (vop_type vop_reg_class:$rs1),
-                 (vop_type (SplatPatKind (XLenVT xop_kind:$rs2))),
+                 (vop1_type vop_reg_class:$rs1),
+                 (vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))),
                  (mask_type true_mask),
                  VLOpFrag)),
             (!cast(instruction_name#_#suffix#_# vlmul.MX)
@@ -317,8 +336,8 @@
                  xop_kind:$rs2, GPR:$vl, sew)>;
   def : Pat<(result_type (vop
-                 (vop_type vop_reg_class:$rs1),
-                 (vop_type (SplatPatKind (XLenVT xop_kind:$rs2))),
+                 (vop1_type vop_reg_class:$rs1),
+                 (vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))),
                  (mask_type V0),
                  VLOpFrag)),
             (!cast(instruction_name#_#suffix#_# vlmul.MX#"_MASK")
@@ -330,12 +349,12 @@
 multiclass VPatBinaryVL_VV_VX {
   foreach vti = AllIntegerVectors in {
-    defm : VPatBinaryVL_VV;
+    defm : VPatBinaryVL_V;
     defm : VPatBinaryVL_XI;
+                           vti.Vector, vti.Vector, vti.Vector, vti.Mask,
+                           vti.Log2SEW, vti.LMul, vti.RegClass, SplatPat, GPR>;
   }
 }
@@ -344,8 +363,8 @@
     : VPatBinaryVL_VV_VX {
   foreach vti = AllIntegerVectors in {
     defm : VPatBinaryVL_XI(SplatPat#_#ImmType), ImmType>;
   }
 }
@@ -355,12 +374,26 @@
   foreach VtiToWti = AllWidenableIntVectors in {
     defvar vti = VtiToWti.Vti;
     defvar wti = VtiToWti.Wti;
-    defm : VPatBinaryVL_VV;
+    defm : VPatBinaryVL_V;
     defm : VPatBinaryVL_XI;
+                           wti.Vector, vti.Vector, vti.Vector, vti.Mask,
+                           vti.Log2SEW, vti.LMul, vti.RegClass, SplatPat, GPR>;
   }
 }
+multiclass VPatBinaryWVL_VV_VX_WV_WX
+    : VPatBinaryWVL_VV_VX {
+  foreach VtiToWti = AllWidenableIntVectors in {
+    defvar vti = VtiToWti.Vti;
+    defvar wti = VtiToWti.Wti;
+    defm : VPatBinaryVL_V;
+    defm : VPatBinaryVL_XI;
+  }
+}
@@ -394,9 +427,9 @@
 multiclass VPatBinaryFPVL_VV_VF {
   foreach vti = AllFloatVectors in {
-    defm :
VPatBinaryVL_VV; + defm : VPatBinaryVL_V; defm : VPatBinaryVL_VF; @@ -714,7 +747,10 @@ } // 12.2. Vector Widening Integer Add/Subtract -defm : VPatBinaryWVL_VV_VX; +defm : VPatBinaryWVL_VV_VX_WV_WX; +defm : VPatBinaryWVL_VV_VX_WV_WX; +defm : VPatBinaryWVL_VV_VX_WV_WX; +defm : VPatBinaryWVL_VV_VX_WV_WX; // 12.3. Vector Integer Extension defm : VPatExtendSDNode_V_VL @vwadd_v2i16(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwadd_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = sext <2 x i8> %a to <2 x i16> + %d = sext <2 x i8> %b to <2 x i16> + %e = add <2 x i16> %c, %d + ret <2 x i16> %e +} + +define <4 x i16> @vwadd_v4i16(<4 x i8>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwadd_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = sext <4 x i8> %a to <4 x i16> + %d = sext <4 x i8> %b to <4 x i16> + %e = add <4 x i16> %c, %d + ret <4 x i16> %e +} + +define <2 x i32> @vwadd_v2i32(<2 x i16>* %x, <2 x i16>* %y) { +; CHECK-LABEL: vwadd_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = load <2 x i16>, <2 x i16>* %y + %c = sext <2 x i16> %a to <2 x i32> + %d = sext <2 x i16> %b to <2 x i32> + %e = add <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <8 x i16> @vwadd_v8i16(<8 x i8>* %x, <8 x i8>* %y) { +; CHECK-LABEL: vwadd_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load <8 x i8>, <8 x i8>* %y + %c = sext <8 x i8> %a to <8 x i16> + %d = sext <8 x i8> %b to <8 x i16> + %e = add <8 x i16> %c, %d + ret <8 x i16> %e +} + +define <4 x i32> @vwadd_v4i32(<4 x i16>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwadd_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = sext <4 x i16> %a to <4 x i32> + %d = sext <4 x i16> %b to <4 x i32> + %e = add <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <2 x i64> @vwadd_v2i64(<2 x i32>* %x, <2 x i32>* %y) { +; CHECK-LABEL: vwadd_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v10, (a1) +; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load <2 x i32>, <2 x i32>* %y + %c = sext <2 x i32> %a to <2 x i64> + %d = sext <2 x i32> %b to <2 x i64> + %e = add <2 x i64> %c, %d + ret <2 x i64> %e +} + +define <16 x i16> @vwadd_v16i16(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: vwadd_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v11, (a1) +; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x 
i8>, <16 x i8>* %y + %c = sext <16 x i8> %a to <16 x i16> + %d = sext <16 x i8> %b to <16 x i16> + %e = add <16 x i16> %c, %d + ret <16 x i16> %e +} + +define <8 x i32> @vwadd_v8i32(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: vwadd_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v11, (a1) +; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = sext <8 x i16> %a to <8 x i32> + %d = sext <8 x i16> %b to <8 x i32> + %e = add <8 x i32> %c, %d + ret <8 x i32> %e +} + +define <4 x i64> @vwadd_v4i64(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: vwadd_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle32.v v11, (a1) +; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = sext <4 x i32> %a to <4 x i64> + %d = sext <4 x i32> %b to <4 x i64> + %e = add <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <32 x i16> @vwadd_v32i16(<32 x i8>* %x, <32 x i8>* %y) { +; CHECK-LABEL: vwadd_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vwadd.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = sext <32 x i8> %a to <32 x i16> + %d = sext <32 x i8> %b to <32 x i16> + %e = add <32 x i16> %c, %d + ret <32 x i16> %e +} + +define <16 x i32> @vwadd_v16i32(<16 x i16>* %x, <16 x i16>* %y) { +; CHECK-LABEL: vwadd_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vwadd.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = sext <16 x i16> %a to <16 x i32> + %d = sext <16 x i16> %b to <16 x i32> + %e = add <16 x i32> %c, %d + ret <16 x i32> %e +} + +define <8 x i64> @vwadd_v8i64(<8 x i32>* %x, <8 x i32>* %y) { +; CHECK-LABEL: vwadd_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vle32.v v14, (a1) +; CHECK-NEXT: vwadd.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = sext <8 x i32> %a to <8 x i64> + %d = sext <8 x i32> %b to <8 x i64> + %e = add <8 x i64> %c, %d + ret <8 x i64> %e +} + +define <64 x i16> @vwadd_v64i16(<64 x i8>* %x, <64 x i8>* %y) { +; CHECK-LABEL: vwadd_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v20, (a1) +; CHECK-NEXT: vwadd.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <64 x i8>, <64 x i8>* %x + %b = load <64 x i8>, <64 x i8>* %y + %c = sext <64 x i8> %a to <64 x i16> + %d = sext <64 x i8> %b to <64 x i16> + %e = add <64 x i16> %c, %d + ret <64 x i16> %e +} + +define <32 x i32> @vwadd_v32i32(<32 x i16>* %x, <32 x i16>* %y) { +; CHECK-LABEL: vwadd_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v20, (a1) +; CHECK-NEXT: vwadd.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %x + %b = load <32 x i16>, <32 x i16>* %y + %c = sext <32 x i16> %a to <32 x 
i32> + %d = sext <32 x i16> %b to <32 x i32> + %e = add <32 x i32> %c, %d + ret <32 x i32> %e +} + +define <16 x i64> @vwadd_v16i64(<16 x i32>* %x, <16 x i32>* %y) { +; CHECK-LABEL: vwadd_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v20, (a1) +; CHECK-NEXT: vwadd.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <16 x i32>, <16 x i32>* %x + %b = load <16 x i32>, <16 x i32>* %y + %c = sext <16 x i32> %a to <16 x i64> + %d = sext <16 x i32> %b to <16 x i64> + %e = add <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <128 x i16> @vwadd_v128i16(<128 x i8>* %x, <128 x i8>* %y) nounwind { +; CHECK-LABEL: vwadd_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <128 x i8>, <128 x i8>* %x + %b = load <128 x i8>, <128 x i8>* %y + %c = sext <128 x i8> %a to <128 x i16> + %d = sext <128 x i8> %b to <128 x i16> + %e = add <128 x i16> %c, %d + ret <128 x i16> %e +} + +define <64 x i32> @vwadd_v64i32(<64 x i16>* %x, <64 x i16>* %y) nounwind { +; CHECK-LABEL: vwadd_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <64 x i16>, <64 x i16>* %x + %b = load <64 x i16>, <64 x i16>* %y + %c = sext <64 x i16> %a to <64 x i32> + %d = sext <64 x i16> %b to <64 x i32> + %e = add <64 x i32> %c, %d + ret <64 x i32> %e +} + +define <32 x i64> @vwadd_v32i64(<32 x i32>* %x, <32 x i32>* %y) nounwind { +; CHECK-LABEL: vwadd_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # 
Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <32 x i32>, <32 x i32>* %x + %b = load <32 x i32>, <32 x i32>* %y + %c = sext <32 x i32> %a to <32 x i64> + %d = sext <32 x i32> %b to <32 x i64> + %e = add <32 x i64> %c, %d + ret <32 x i64> %e +} + +define <2 x i32> @vwadd_v2i32_v2i8(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwadd_v2i32_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vwadd.vv v8, v11, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = sext <2 x i8> %a to <2 x i32> + %d = sext <2 x i8> %b to <2 x i32> + %e = add <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <4 x i32> @vwadd_v4i32_v4i8_v4i16(<4 x i8>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwadd_v4i32_v4i8_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vwadd.vv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = sext <4 x i8> %a to <4 x i32> + %d = sext <4 x i16> %b to <4 x i32> + %e = add <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <4 x i64> @vwadd_v4i64_v4i32_v4i8(<4 x i32>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwadd_v4i64_v4i32_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsext.vf4 v11, v8 +; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = sext <4 x i32> %a to <4 x i64> + %d = sext <4 x i8> %b to <4 x i64> + %e = add <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <2 x i16> @vwadd_vx_v2i16(<2 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwadd_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = insertelement <2 x i8> undef, i8 %y, i32 0 + %c = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer + %d = sext <2 x i8> %a to <2 x i16> + %e = sext <2 x i8> %c to <2 x i16> + %f = add <2 x i16> %d, %e + ret <2 x i16> %f +} + +define <4 x i16> @vwadd_vx_v4i16(<4 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwadd_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = insertelement <4 x i8> undef, i8 %y, i32 0 + %c = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer + %d = sext <4 x i8> %a to <4 x i16> + %e = sext <4 x i8> %c to <4 x i16> + %f = add <4 x i16> %d, %e + ret <4 x i16> %f +} + +define <2 x i32> @vwadd_vx_v2i32(<2 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwadd_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, 
e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = insertelement <2 x i16> undef, i16 %y, i32 0 + %c = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %d = sext <2 x i16> %a to <2 x i32> + %e = sext <2 x i16> %c to <2 x i32> + %f = add <2 x i32> %d, %e + ret <2 x i32> %f +} + +define <8 x i16> @vwadd_vx_v8i16(<8 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwadd_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = insertelement <8 x i8> undef, i8 %y, i32 0 + %c = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer + %d = sext <8 x i8> %a to <8 x i16> + %e = sext <8 x i8> %c to <8 x i16> + %f = add <8 x i16> %d, %e + ret <8 x i16> %f +} + +define <4 x i32> @vwadd_vx_v4i32(<4 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwadd_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = insertelement <4 x i16> undef, i16 %y, i32 0 + %c = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer + %d = sext <4 x i16> %a to <4 x i32> + %e = sext <4 x i16> %c to <4 x i32> + %f = add <4 x i32> %d, %e + ret <4 x i32> %f +} + +define <2 x i64> @vwadd_vx_v2i64(<2 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwadd_vx_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = insertelement <2 x i32> undef, i32 %y, i64 0 + %c = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer + %d = sext <2 x i32> %a to <2 x i64> + %e = sext <2 x i32> %c to <2 x i64> + %f = add <2 x i64> %d, %e + ret <2 x i64> %f +} + +define <16 x i16> @vwadd_vx_v16i16(<16 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwadd_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vwadd.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = insertelement <16 x i8> undef, i8 %y, i32 0 + %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + %d = sext <16 x i8> %a to <16 x i16> + %e = sext <16 x i8> %c to <16 x i16> + %f = add <16 x i16> %d, %e + ret <16 x i16> %f +} + +define <8 x i32> @vwadd_vx_v8i32(<8 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwadd_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vwadd.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = insertelement <8 x i16> undef, i16 %y, i32 0 + %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer + %d = sext <8 x i16> %a to <8 x i32> + %e = sext <8 x i16> %c to <8 x i32> + %f = add <8 x i32> %d, %e + ret <8 x i32> %f +} + +define <4 x i64> @vwadd_vx_v4i64(<4 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwadd_vx_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vwadd.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = insertelement <4 x i32> undef, i32 %y, i64 0 + %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer + %d = sext <4 x i32> %a to <4 x i64> + %e 
= sext <4 x i32> %c to <4 x i64> + %f = add <4 x i64> %d, %e + ret <4 x i64> %f +} + +define <32 x i16> @vwadd_vx_v32i16(<32 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwadd_vx_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vwadd.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = insertelement <32 x i8> undef, i8 %y, i32 0 + %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer + %d = sext <32 x i8> %a to <32 x i16> + %e = sext <32 x i8> %c to <32 x i16> + %f = add <32 x i16> %d, %e + ret <32 x i16> %f +} + +define <16 x i32> @vwadd_vx_v16i32(<16 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwadd_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vwadd.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = insertelement <16 x i16> undef, i16 %y, i32 0 + %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer + %d = sext <16 x i16> %a to <16 x i32> + %e = sext <16 x i16> %c to <16 x i32> + %f = add <16 x i32> %d, %e + ret <16 x i32> %f +} + +define <8 x i64> @vwadd_vx_v8i64(<8 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwadd_vx_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vwadd.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = insertelement <8 x i32> undef, i32 %y, i64 0 + %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer + %d = sext <8 x i32> %a to <8 x i64> + %e = sext <8 x i32> %c to <8 x i64> + %f = add <8 x i64> %d, %e + ret <8 x i64> %f +} + +define <64 x i16> @vwadd_vx_v64i16(<64 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwadd_vx_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vwadd.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <64 x i8>, <64 x i8>* %x + %b = insertelement <64 x i8> undef, i8 %y, i32 0 + %c = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %d = sext <64 x i8> %a to <64 x i16> + %e = sext <64 x i8> %c to <64 x i16> + %f = add <64 x i16> %d, %e + ret <64 x i16> %f +} + +define <32 x i32> @vwadd_vx_v32i32(<32 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwadd_vx_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vwadd.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %x + %b = insertelement <32 x i16> undef, i16 %y, i32 0 + %c = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %d = sext <32 x i16> %a to <32 x i32> + %e = sext <32 x i16> %c to <32 x i32> + %f = add <32 x i32> %d, %e + ret <32 x i32> %f +} + +define <16 x i64> @vwadd_vx_v16i64(<16 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwadd_vx_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vwadd.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <16 x i32>, <16 x i32>* %x + %b = insertelement <16 x i32> undef, i32 %y, i64 0 + %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %d = sext <16 x i32> %a to <16 x i64> + %e = sext <16 x i32> %c to <16 x i64> + %f = add <16 x i64> %d, %e + ret <16 x i64> %f +} + +define <8 x i16> @vwadd_vx_v8i16_i8(<8 x i8>* %x, i8* %y) { +; CHECK-LABEL: 
vwadd_vx_v8i16_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load i8, i8* %y + %c = sext i8 %b to i16 + %d = insertelement <8 x i16> undef, i16 %c, i32 0 + %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer + %f = sext <8 x i8> %a to <8 x i16> + %g = add <8 x i16> %e, %f + ret <8 x i16> %g +} + +define <8 x i16> @vwadd_vx_v8i16_i16(<8 x i8>* %x, i16* %y) { +; CHECK-LABEL: vwadd_vx_v8i16_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vlse16.v v10, (a1), zero +; CHECK-NEXT: vwadd.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load i16, i16* %y + %d = insertelement <8 x i16> undef, i16 %b, i32 0 + %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer + %f = sext <8 x i8> %a to <8 x i16> + %g = add <8 x i16> %e, %f + ret <8 x i16> %g +} + +define <4 x i32> @vwadd_vx_v4i32_i8(<4 x i16>* %x, i8* %y) { +; CHECK-LABEL: vwadd_vx_v4i32_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i8, i8* %y + %c = sext i8 %b to i32 + %d = insertelement <4 x i32> undef, i32 %c, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = sext <4 x i16> %a to <4 x i32> + %g = add <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <4 x i32> @vwadd_vx_v4i32_i16(<4 x i16>* %x, i16* %y) { +; CHECK-LABEL: vwadd_vx_v4i32_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i16, i16* %y + %c = sext i16 %b to i32 + %d = insertelement <4 x i32> undef, i32 %c, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = sext <4 x i16> %a to <4 x i32> + %g = add <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <4 x i32> @vwadd_vx_v4i32_i32(<4 x i16>* %x, i32* %y) { +; CHECK-LABEL: vwadd_vx_v4i32_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vlse32.v v10, (a1), zero +; CHECK-NEXT: vwadd.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i32, i32* %y + %d = insertelement <4 x i32> undef, i32 %b, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = sext <4 x i16> %a to <4 x i32> + %g = add <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <2 x i64> @vwadd_vx_v2i64_i8(<2 x i32>* %x, i8* %y) nounwind { +; RV32-LABEL: vwadd_vx_v2i64_i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lb a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwadd.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i64_i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lb a0, 0(a1) +; RV64-NEXT: vwadd.vx v8, v9, a0 +; RV64-NEXT: ret + %a = load <2 
x i32>, <2 x i32>* %x + %b = load i8, i8* %y + %c = sext i8 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwadd_vx_v2i64_i16(<2 x i32>* %x, i16* %y) nounwind { +; RV32-LABEL: vwadd_vx_v2i64_i16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lh a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwadd.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i64_i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lh a0, 0(a1) +; RV64-NEXT: vwadd.vx v8, v9, a0 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i16, i16* %y + %c = sext i16 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwadd_vx_v2i64_i32(<2 x i32>* %x, i32* %y) nounwind { +; RV32-LABEL: vwadd_vx_v2i64_i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwadd.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i64_i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lw a0, 0(a1) +; RV64-NEXT: vwadd.vx v8, v9, a0 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i32, i32* %y + %c = sext i32 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwadd_vx_v2i64_i64(<2 x i32>* %x, i64* %y) nounwind { +; RV32-LABEL: vwadd_vx_v2i64_i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: sw a2, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwadd.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i64_i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vlse64.v v10, (a1), zero +; RV64-NEXT: vwadd.wv v8, v10, v9 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i64, i64* %y + %d = insertelement <2 x i64> undef, i64 %b, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll 
@@ -0,0 +1,863 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +define <2 x i16> @vwaddu_v2i16(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwaddu_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = zext <2 x i8> %a to <2 x i16> + %d = zext <2 x i8> %b to <2 x i16> + %e = add <2 x i16> %c, %d + ret <2 x i16> %e +} + +define <4 x i16> @vwaddu_v4i16(<4 x i8>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwaddu_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = zext <4 x i8> %a to <4 x i16> + %d = zext <4 x i8> %b to <4 x i16> + %e = add <4 x i16> %c, %d + ret <4 x i16> %e +} + +define <2 x i32> @vwaddu_v2i32(<2 x i16>* %x, <2 x i16>* %y) { +; CHECK-LABEL: vwaddu_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = load <2 x i16>, <2 x i16>* %y + %c = zext <2 x i16> %a to <2 x i32> + %d = zext <2 x i16> %b to <2 x i32> + %e = add <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <8 x i16> @vwaddu_v8i16(<8 x i8>* %x, <8 x i8>* %y) { +; CHECK-LABEL: vwaddu_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load <8 x i8>, <8 x i8>* %y + %c = zext <8 x i8> %a to <8 x i16> + %d = zext <8 x i8> %b to <8 x i16> + %e = add <8 x i16> %c, %d + ret <8 x i16> %e +} + +define <4 x i32> @vwaddu_v4i32(<4 x i16>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwaddu_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = zext <4 x i16> %a to <4 x i32> + %d = zext <4 x i16> %b to <4 x i32> + %e = add <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <2 x i64> @vwaddu_v2i64(<2 x i32>* %x, <2 x i32>* %y) { +; CHECK-LABEL: vwaddu_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v10, (a1) +; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load <2 x i32>, <2 x i32>* %y + %c = zext <2 x i32> %a to <2 x i64> + %d = zext <2 x i32> %b to <2 x i64> + %e = add <2 x i64> %c, %d + ret <2 x i64> %e +} + +define <16 x i16> @vwaddu_v16i16(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: vwaddu_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v11, (a1) +; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; 
CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = zext <16 x i8> %a to <16 x i16> + %d = zext <16 x i8> %b to <16 x i16> + %e = add <16 x i16> %c, %d + ret <16 x i16> %e +} + +define <8 x i32> @vwaddu_v8i32(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: vwaddu_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v11, (a1) +; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = zext <8 x i16> %a to <8 x i32> + %d = zext <8 x i16> %b to <8 x i32> + %e = add <8 x i32> %c, %d + ret <8 x i32> %e +} + +define <4 x i64> @vwaddu_v4i64(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: vwaddu_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle32.v v11, (a1) +; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = zext <4 x i32> %a to <4 x i64> + %d = zext <4 x i32> %b to <4 x i64> + %e = add <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <32 x i16> @vwaddu_v32i16(<32 x i8>* %x, <32 x i8>* %y) { +; CHECK-LABEL: vwaddu_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vwaddu.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = zext <32 x i8> %a to <32 x i16> + %d = zext <32 x i8> %b to <32 x i16> + %e = add <32 x i16> %c, %d + ret <32 x i16> %e +} + +define <16 x i32> @vwaddu_v16i32(<16 x i16>* %x, <16 x i16>* %y) { +; CHECK-LABEL: vwaddu_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vwaddu.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = zext <16 x i16> %a to <16 x i32> + %d = zext <16 x i16> %b to <16 x i32> + %e = add <16 x i32> %c, %d + ret <16 x i32> %e +} + +define <8 x i64> @vwaddu_v8i64(<8 x i32>* %x, <8 x i32>* %y) { +; CHECK-LABEL: vwaddu_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vle32.v v14, (a1) +; CHECK-NEXT: vwaddu.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = zext <8 x i32> %a to <8 x i64> + %d = zext <8 x i32> %b to <8 x i64> + %e = add <8 x i64> %c, %d + ret <8 x i64> %e +} + +define <64 x i16> @vwaddu_v64i16(<64 x i8>* %x, <64 x i8>* %y) { +; CHECK-LABEL: vwaddu_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v20, (a1) +; CHECK-NEXT: vwaddu.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <64 x i8>, <64 x i8>* %x + %b = load <64 x i8>, <64 x i8>* %y + %c = zext <64 x i8> %a to <64 x i16> + %d = zext <64 x i8> %b to <64 x i16> + %e = add <64 x i16> %c, %d + ret <64 x i16> %e +} + +define <32 x i32> @vwaddu_v32i32(<32 x i16>* %x, <32 x i16>* %y) { +; CHECK-LABEL: vwaddu_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v20, (a1) +; CHECK-NEXT: vwaddu.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <32 x 
i16>, <32 x i16>* %x + %b = load <32 x i16>, <32 x i16>* %y + %c = zext <32 x i16> %a to <32 x i32> + %d = zext <32 x i16> %b to <32 x i32> + %e = add <32 x i32> %c, %d + ret <32 x i32> %e +} + +define <16 x i64> @vwaddu_v16i64(<16 x i32>* %x, <16 x i32>* %y) { +; CHECK-LABEL: vwaddu_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v20, (a1) +; CHECK-NEXT: vwaddu.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <16 x i32>, <16 x i32>* %x + %b = load <16 x i32>, <16 x i32>* %y + %c = zext <16 x i32> %a to <16 x i64> + %d = zext <16 x i32> %b to <16 x i64> + %e = add <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <128 x i16> @vwaddu_v128i16(<128 x i8>* %x, <128 x i8>* %y) nounwind { +; CHECK-LABEL: vwaddu_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <128 x i8>, <128 x i8>* %x + %b = load <128 x i8>, <128 x i8>* %y + %c = zext <128 x i8> %a to <128 x i16> + %d = zext <128 x i8> %b to <128 x i16> + %e = add <128 x i16> %c, %d + ret <128 x i16> %e +} + +define <64 x i32> @vwaddu_v64i32(<64 x i16>* %x, <64 x i16>* %y) nounwind { +; CHECK-LABEL: vwaddu_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <64 x i16>, <64 x i16>* %x + %b = load <64 x i16>, <64 x i16>* %y + %c = zext <64 x i16> %a to <64 x i32> + %d = zext <64 x i16> %b to <64 x i32> + %e = add <64 x i32> %c, %d + ret <64 x i32> %e +} + +define <32 x i64> @vwaddu_v32i64(<32 x i32>* %x, <32 x i32>* %y) nounwind { +; CHECK-LABEL: vwaddu_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; 
CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <32 x i32>, <32 x i32>* %x + %b = load <32 x i32>, <32 x i32>* %y + %c = zext <32 x i32> %a to <32 x i64> + %d = zext <32 x i32> %b to <32 x i64> + %e = add <32 x i64> %c, %d + ret <32 x i64> %e +} + +define <2 x i32> @vwaddu_v2i32_v2i8(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwaddu_v2i32_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vwaddu.vv v8, v11, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = zext <2 x i8> %a to <2 x i32> + %d = zext <2 x i8> %b to <2 x i32> + %e = add <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <4 x i32> @vwaddu_v4i32_v4i8_v4i16(<4 x i8>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwaddu_v4i32_v4i8_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vwaddu.vv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = zext <4 x i8> %a to <4 x i32> + %d = zext <4 x i16> %b to <4 x i32> + %e = add <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <4 x i64> @vwaddu_v4i64_v4i32_v4i8(<4 x i32>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwaddu_v4i64_v4i32_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vzext.vf4 v11, v8 +; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = zext <4 x i32> %a to <4 x i64> + %d = zext <4 x i8> %b to <4 x i64> + %e = add <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <2 x i16> @vwaddu_vx_v2i16(<2 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwaddu_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = insertelement <2 x i8> undef, i8 %y, i32 0 + %c = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer + %d = zext <2 x i8> %a to <2 x i16> + %e = zext <2 x i8> %c to <2 x i16> + %f = add <2 x i16> %d, %e + ret <2 x i16> %f +} + +define <4 x i16> @vwaddu_vx_v4i16(<4 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwaddu_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = insertelement <4 x i8> undef, i8 %y, i32 0 + %c = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer + %d = zext <4 x i8> %a to <4 x i16> + %e = zext <4 x i8> %c to <4 x i16> + %f = add <4 x i16> %d, %e + ret <4 x i16> %f +} + +define <2 x i32> 
@vwaddu_vx_v2i32(<2 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwaddu_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = insertelement <2 x i16> undef, i16 %y, i32 0 + %c = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %d = zext <2 x i16> %a to <2 x i32> + %e = zext <2 x i16> %c to <2 x i32> + %f = add <2 x i32> %d, %e + ret <2 x i32> %f +} + +define <8 x i16> @vwaddu_vx_v8i16(<8 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwaddu_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = insertelement <8 x i8> undef, i8 %y, i32 0 + %c = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer + %d = zext <8 x i8> %a to <8 x i16> + %e = zext <8 x i8> %c to <8 x i16> + %f = add <8 x i16> %d, %e + ret <8 x i16> %f +} + +define <4 x i32> @vwaddu_vx_v4i32(<4 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwaddu_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = insertelement <4 x i16> undef, i16 %y, i32 0 + %c = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer + %d = zext <4 x i16> %a to <4 x i32> + %e = zext <4 x i16> %c to <4 x i32> + %f = add <4 x i32> %d, %e + ret <4 x i32> %f +} + +define <2 x i64> @vwaddu_vx_v2i64(<2 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwaddu_vx_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = insertelement <2 x i32> undef, i32 %y, i64 0 + %c = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer + %d = zext <2 x i32> %a to <2 x i64> + %e = zext <2 x i32> %c to <2 x i64> + %f = add <2 x i64> %d, %e + ret <2 x i64> %f +} + +define <16 x i16> @vwaddu_vx_v16i16(<16 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwaddu_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vwaddu.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = insertelement <16 x i8> undef, i8 %y, i32 0 + %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + %d = zext <16 x i8> %a to <16 x i16> + %e = zext <16 x i8> %c to <16 x i16> + %f = add <16 x i16> %d, %e + ret <16 x i16> %f +} + +define <8 x i32> @vwaddu_vx_v8i32(<8 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwaddu_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vwaddu.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = insertelement <8 x i16> undef, i16 %y, i32 0 + %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer + %d = zext <8 x i16> %a to <8 x i32> + %e = zext <8 x i16> %c to <8 x i32> + %f = add <8 x i32> %d, %e + ret <8 x i32> %f +} + +define <4 x i64> @vwaddu_vx_v4i64(<4 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwaddu_vx_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vwaddu.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = insertelement <4 x 
i32> undef, i32 %y, i64 0 + %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer + %d = zext <4 x i32> %a to <4 x i64> + %e = zext <4 x i32> %c to <4 x i64> + %f = add <4 x i64> %d, %e + ret <4 x i64> %f +} + +define <32 x i16> @vwaddu_vx_v32i16(<32 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwaddu_vx_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vwaddu.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = insertelement <32 x i8> undef, i8 %y, i32 0 + %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer + %d = zext <32 x i8> %a to <32 x i16> + %e = zext <32 x i8> %c to <32 x i16> + %f = add <32 x i16> %d, %e + ret <32 x i16> %f +} + +define <16 x i32> @vwaddu_vx_v16i32(<16 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwaddu_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vwaddu.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = insertelement <16 x i16> undef, i16 %y, i32 0 + %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer + %d = zext <16 x i16> %a to <16 x i32> + %e = zext <16 x i16> %c to <16 x i32> + %f = add <16 x i32> %d, %e + ret <16 x i32> %f +} + +define <8 x i64> @vwaddu_vx_v8i64(<8 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwaddu_vx_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vwaddu.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = insertelement <8 x i32> undef, i32 %y, i64 0 + %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer + %d = zext <8 x i32> %a to <8 x i64> + %e = zext <8 x i32> %c to <8 x i64> + %f = add <8 x i64> %d, %e + ret <8 x i64> %f +} + +define <64 x i16> @vwaddu_vx_v64i16(<64 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwaddu_vx_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vwaddu.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <64 x i8>, <64 x i8>* %x + %b = insertelement <64 x i8> undef, i8 %y, i32 0 + %c = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %d = zext <64 x i8> %a to <64 x i16> + %e = zext <64 x i8> %c to <64 x i16> + %f = add <64 x i16> %d, %e + ret <64 x i16> %f +} + +define <32 x i32> @vwaddu_vx_v32i32(<32 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwaddu_vx_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vwaddu.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %x + %b = insertelement <32 x i16> undef, i16 %y, i32 0 + %c = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %d = zext <32 x i16> %a to <32 x i32> + %e = zext <32 x i16> %c to <32 x i32> + %f = add <32 x i32> %d, %e + ret <32 x i32> %f +} + +define <16 x i64> @vwaddu_vx_v16i64(<16 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwaddu_vx_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vwaddu.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <16 x i32>, <16 x i32>* %x + %b = insertelement <16 x i32> undef, i32 %y, i64 0 + %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %d = zext <16 x i32> %a to <16 x i64> + %e = zext 
<16 x i32> %c to <16 x i64> + %f = add <16 x i64> %d, %e + ret <16 x i64> %f +} + +define <8 x i16> @vwaddu_vx_v8i16_i8(<8 x i8>* %x, i8* %y) { +; CHECK-LABEL: vwaddu_vx_v8i16_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: lbu a0, 0(a1) +; CHECK-NEXT: vwaddu.vx v8, v9, a0 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load i8, i8* %y + %c = zext i8 %b to i16 + %d = insertelement <8 x i16> undef, i16 %c, i32 0 + %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer + %f = zext <8 x i8> %a to <8 x i16> + %g = add <8 x i16> %e, %f + ret <8 x i16> %g +} + +define <8 x i16> @vwaddu_vx_v8i16_i16(<8 x i8>* %x, i16* %y) { +; CHECK-LABEL: vwaddu_vx_v8i16_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vlse16.v v10, (a1), zero +; CHECK-NEXT: vwaddu.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load i16, i16* %y + %d = insertelement <8 x i16> undef, i16 %b, i32 0 + %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer + %f = zext <8 x i8> %a to <8 x i16> + %g = add <8 x i16> %e, %f + ret <8 x i16> %g +} + +define <4 x i32> @vwaddu_vx_v4i32_i8(<4 x i16>* %x, i8* %y) { +; CHECK-LABEL: vwaddu_vx_v4i32_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lbu a0, 0(a1) +; CHECK-NEXT: vwaddu.vx v8, v9, a0 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i8, i8* %y + %c = zext i8 %b to i32 + %d = insertelement <4 x i32> undef, i32 %c, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = zext <4 x i16> %a to <4 x i32> + %g = add <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <4 x i32> @vwaddu_vx_v4i32_i16(<4 x i16>* %x, i16* %y) { +; CHECK-LABEL: vwaddu_vx_v4i32_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lhu a0, 0(a1) +; CHECK-NEXT: vwaddu.vx v8, v9, a0 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i16, i16* %y + %c = zext i16 %b to i32 + %d = insertelement <4 x i32> undef, i32 %c, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = zext <4 x i16> %a to <4 x i32> + %g = add <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <4 x i32> @vwaddu_vx_v4i32_i32(<4 x i16>* %x, i32* %y) { +; CHECK-LABEL: vwaddu_vx_v4i32_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vlse32.v v10, (a1), zero +; CHECK-NEXT: vwaddu.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i32, i32* %y + %d = insertelement <4 x i32> undef, i32 %b, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = zext <4 x i16> %a to <4 x i32> + %g = add <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <2 x i64> @vwaddu_vx_v2i64_i8(<2 x i32>* %x, i8* %y) nounwind { +; RV32-LABEL: vwaddu_vx_v2i64_i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwaddu.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i64_i8: +; RV64: # %bb.0: +; RV64-NEXT: 
vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: vwaddu.vx v8, v9, a0 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i8, i8* %y + %c = zext i8 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = zext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwaddu_vx_v2i64_i16(<2 x i32>* %x, i16* %y) nounwind { +; RV32-LABEL: vwaddu_vx_v2i64_i16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwaddu.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i64_i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lhu a0, 0(a1) +; RV64-NEXT: vwaddu.vx v8, v9, a0 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i16, i16* %y + %c = zext i16 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = zext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwaddu_vx_v2i64_i32(<2 x i32>* %x, i32* %y) nounwind { +; RV32-LABEL: vwaddu_vx_v2i64_i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwaddu.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i64_i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lwu a0, 0(a1) +; RV64-NEXT: vwaddu.vx v8, v9, a0 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i32, i32* %y + %c = zext i32 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = zext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwaddu_vx_v2i64_i64(<2 x i32>* %x, i64* %y) nounwind { +; RV32-LABEL: vwaddu_vx_v2i64_i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: sw a2, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwaddu.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i64_i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vlse64.v v10, (a1), zero +; RV64-NEXT: vwaddu.wv v8, v10, v9 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i64, i64* %y + %d = insertelement <2 x i64> undef, i64 %b, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = zext <2 x i32> %a to <2 x i64> + %g = add <2 x i64> %e, %f + ret <2 x i64> %g +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -0,0 +1,884 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +define <2 x i16> @vwsub_v2i16(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwsub_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = sext <2 x i8> %a to <2 x i16> + %d = sext <2 x i8> %b to <2 x i16> + %e = sub <2 x i16> %c, %d + ret <2 x i16> %e +} + +define <4 x i16> @vwsub_v4i16(<4 x i8>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwsub_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = sext <4 x i8> %a to <4 x i16> + %d = sext <4 x i8> %b to <4 x i16> + %e = sub <4 x i16> %c, %d + ret <4 x i16> %e +} + +define <2 x i32> @vwsub_v2i32(<2 x i16>* %x, <2 x i16>* %y) { +; CHECK-LABEL: vwsub_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = load <2 x i16>, <2 x i16>* %y + %c = sext <2 x i16> %a to <2 x i32> + %d = sext <2 x i16> %b to <2 x i32> + %e = sub <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <8 x i16> @vwsub_v8i16(<8 x i8>* %x, <8 x i8>* %y) { +; CHECK-LABEL: vwsub_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load <8 x i8>, <8 x i8>* %y + %c = sext <8 x i8> %a to <8 x i16> + %d = sext <8 x i8> %b to <8 x i16> + %e = sub <8 x i16> %c, %d + ret <8 x i16> %e +} + +define <4 x i32> @vwsub_v4i32(<4 x i16>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwsub_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = sext <4 x i16> %a to <4 x i32> + %d = sext <4 x i16> %b to <4 x i32> + %e = sub <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <2 x i64> @vwsub_v2i64(<2 x i32>* %x, <2 x i32>* %y) { +; CHECK-LABEL: vwsub_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v10, (a1) +; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load <2 x i32>, <2 x i32>* %y + %c = sext <2 x i32> %a to <2 x i64> + %d = sext <2 x i32> %b to <2 x i64> + %e = sub <2 x i64> %c, %d + ret <2 x i64> %e +} + +define <16 x i16> @vwsub_v16i16(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: vwsub_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 
16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v11, (a1) +; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = sext <16 x i8> %a to <16 x i16> + %d = sext <16 x i8> %b to <16 x i16> + %e = sub <16 x i16> %c, %d + ret <16 x i16> %e +} + +define <8 x i32> @vwsub_v8i32(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: vwsub_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v11, (a1) +; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = sext <8 x i16> %a to <8 x i32> + %d = sext <8 x i16> %b to <8 x i32> + %e = sub <8 x i32> %c, %d + ret <8 x i32> %e +} + +define <4 x i64> @vwsub_v4i64(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: vwsub_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle32.v v11, (a1) +; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = sext <4 x i32> %a to <4 x i64> + %d = sext <4 x i32> %b to <4 x i64> + %e = sub <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <32 x i16> @vwsub_v32i16(<32 x i8>* %x, <32 x i8>* %y) { +; CHECK-LABEL: vwsub_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vwsub.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = sext <32 x i8> %a to <32 x i16> + %d = sext <32 x i8> %b to <32 x i16> + %e = sub <32 x i16> %c, %d + ret <32 x i16> %e +} + +define <16 x i32> @vwsub_v16i32(<16 x i16>* %x, <16 x i16>* %y) { +; CHECK-LABEL: vwsub_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vwsub.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = sext <16 x i16> %a to <16 x i32> + %d = sext <16 x i16> %b to <16 x i32> + %e = sub <16 x i32> %c, %d + ret <16 x i32> %e +} + +define <8 x i64> @vwsub_v8i64(<8 x i32>* %x, <8 x i32>* %y) { +; CHECK-LABEL: vwsub_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vle32.v v14, (a1) +; CHECK-NEXT: vwsub.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = sext <8 x i32> %a to <8 x i64> + %d = sext <8 x i32> %b to <8 x i64> + %e = sub <8 x i64> %c, %d + ret <8 x i64> %e +} + +define <64 x i16> @vwsub_v64i16(<64 x i8>* %x, <64 x i8>* %y) { +; CHECK-LABEL: vwsub_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v20, (a1) +; CHECK-NEXT: vwsub.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <64 x i8>, <64 x i8>* %x + %b = load <64 x i8>, <64 x i8>* %y + %c = sext <64 x i8> %a to <64 x i16> + %d = sext <64 x i8> %b to <64 x i16> + %e = sub <64 x i16> %c, %d + ret <64 x i16> %e +} + +define <32 x i32> @vwsub_v32i32(<32 x i16>* %x, <32 x i16>* %y) { +; CHECK-LABEL: vwsub_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; 
CHECK-NEXT: vle16.v v20, (a1) +; CHECK-NEXT: vwsub.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %x + %b = load <32 x i16>, <32 x i16>* %y + %c = sext <32 x i16> %a to <32 x i32> + %d = sext <32 x i16> %b to <32 x i32> + %e = sub <32 x i32> %c, %d + ret <32 x i32> %e +} + +define <16 x i64> @vwsub_v16i64(<16 x i32>* %x, <16 x i32>* %y) { +; CHECK-LABEL: vwsub_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v20, (a1) +; CHECK-NEXT: vwsub.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <16 x i32>, <16 x i32>* %x + %b = load <16 x i32>, <16 x i32>* %y + %c = sext <16 x i32> %a to <16 x i64> + %d = sext <16 x i32> %b to <16 x i64> + %e = sub <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <128 x i16> @vwsub_v128i16(<128 x i8>* %x, <128 x i8>* %y) nounwind { +; CHECK-LABEL: vwsub_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <128 x i8>, <128 x i8>* %x + %b = load <128 x i8>, <128 x i8>* %y + %c = sext <128 x i8> %a to <128 x i16> + %d = sext <128 x i8> %b to <128 x i16> + %e = sub <128 x i16> %c, %d + ret <128 x i16> %e +} + +define <64 x i32> @vwsub_v64i32(<64 x i16>* %x, <64 x i16>* %y) nounwind { +; CHECK-LABEL: vwsub_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <64 x i16>, <64 x i16>* %x + %b = load <64 x i16>, <64 x i16>* %y + %c = sext <64 x i16> %a to <64 x i32> + %d = sext <64 x i16> %b to <64 x i32> + %e = sub <64 x i32> %c, %d + ret <64 x i32> %e +} + +define <32 x i64> @vwsub_v32i64(<32 x i32>* %x, <32 x i32>* %y) nounwind { +; CHECK-LABEL: vwsub_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; CHECK-NEXT: vle32.v v16, 
(a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <32 x i32>, <32 x i32>* %x + %b = load <32 x i32>, <32 x i32>* %y + %c = sext <32 x i32> %a to <32 x i64> + %d = sext <32 x i32> %b to <32 x i64> + %e = sub <32 x i64> %c, %d + ret <32 x i64> %e +} + +define <2 x i32> @vwsub_v2i32_v2i8(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwsub_v2i32_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vwsub.vv v8, v11, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = sext <2 x i8> %a to <2 x i32> + %d = sext <2 x i8> %b to <2 x i32> + %e = sub <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <4 x i32> @vwsub_v4i32_v4i8_v4i16(<4 x i8>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwsub_v4i32_v4i8_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = sext <4 x i8> %a to <4 x i32> + %d = sext <4 x i16> %b to <4 x i32> + %e = sub <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <4 x i64> @vwsub_v4i64_v4i32_v4i8(<4 x i32>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwsub_v4i64_v4i32_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsext.vf4 v11, v8 +; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = sext <4 x i32> %a to <4 x i64> + %d = sext <4 x i8> %b to <4 x i64> + %e = sub <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <2 x i16> @vwsub_vx_v2i16(<2 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsub_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = insertelement <2 x i8> undef, i8 %y, i32 0 + %c = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer + %d = sext <2 x i8> %a to <2 x i16> + %e = sext <2 x i8> %c to <2 x i16> + %f = sub <2 x i16> %d, %e + ret <2 x i16> %f +} + +define <4 x i16> @vwsub_vx_v4i16(<4 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsub_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = insertelement <4 x i8> undef, i8 %y, i32 0 + %c = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer + %d = sext <4 x i8> %a to <4 x i16> + %e = sext <4 x i8> %c to <4 x i16> + 
%f = sub <4 x i16> %d, %e + ret <4 x i16> %f +} + +define <2 x i32> @vwsub_vx_v2i32(<2 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwsub_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = insertelement <2 x i16> undef, i16 %y, i32 0 + %c = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %d = sext <2 x i16> %a to <2 x i32> + %e = sext <2 x i16> %c to <2 x i32> + %f = sub <2 x i32> %d, %e + ret <2 x i32> %f +} + +define <8 x i16> @vwsub_vx_v8i16(<8 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsub_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = insertelement <8 x i8> undef, i8 %y, i32 0 + %c = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer + %d = sext <8 x i8> %a to <8 x i16> + %e = sext <8 x i8> %c to <8 x i16> + %f = sub <8 x i16> %d, %e + ret <8 x i16> %f +} + +define <4 x i32> @vwsub_vx_v4i32(<4 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwsub_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = insertelement <4 x i16> undef, i16 %y, i32 0 + %c = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer + %d = sext <4 x i16> %a to <4 x i32> + %e = sext <4 x i16> %c to <4 x i32> + %f = sub <4 x i32> %d, %e + ret <4 x i32> %f +} + +define <2 x i64> @vwsub_vx_v2i64(<2 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwsub_vx_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = insertelement <2 x i32> undef, i32 %y, i64 0 + %c = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer + %d = sext <2 x i32> %a to <2 x i64> + %e = sext <2 x i32> %c to <2 x i64> + %f = sub <2 x i64> %d, %e + ret <2 x i64> %f +} + +define <16 x i16> @vwsub_vx_v16i16(<16 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsub_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vwsub.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = insertelement <16 x i8> undef, i8 %y, i32 0 + %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + %d = sext <16 x i8> %a to <16 x i16> + %e = sext <16 x i8> %c to <16 x i16> + %f = sub <16 x i16> %d, %e + ret <16 x i16> %f +} + +define <8 x i32> @vwsub_vx_v8i32(<8 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwsub_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vwsub.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = insertelement <8 x i16> undef, i16 %y, i32 0 + %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer + %d = sext <8 x i16> %a to <8 x i32> + %e = sext <8 x i16> %c to <8 x i32> + %f = sub <8 x i32> %d, %e + ret <8 x i32> %f +} + +define <4 x i64> @vwsub_vx_v4i64(<4 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwsub_vx_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vwsub.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <4 x 
i32>, <4 x i32>* %x + %b = insertelement <4 x i32> undef, i32 %y, i64 0 + %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer + %d = sext <4 x i32> %a to <4 x i64> + %e = sext <4 x i32> %c to <4 x i64> + %f = sub <4 x i64> %d, %e + ret <4 x i64> %f +} + +define <32 x i16> @vwsub_vx_v32i16(<32 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsub_vx_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vwsub.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = insertelement <32 x i8> undef, i8 %y, i32 0 + %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer + %d = sext <32 x i8> %a to <32 x i16> + %e = sext <32 x i8> %c to <32 x i16> + %f = sub <32 x i16> %d, %e + ret <32 x i16> %f +} + +define <16 x i32> @vwsub_vx_v16i32(<16 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwsub_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vwsub.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = insertelement <16 x i16> undef, i16 %y, i32 0 + %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer + %d = sext <16 x i16> %a to <16 x i32> + %e = sext <16 x i16> %c to <16 x i32> + %f = sub <16 x i32> %d, %e + ret <16 x i32> %f +} + +define <8 x i64> @vwsub_vx_v8i64(<8 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwsub_vx_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vwsub.vx v8, v12, a1 +; CHECK-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = insertelement <8 x i32> undef, i32 %y, i64 0 + %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer + %d = sext <8 x i32> %a to <8 x i64> + %e = sext <8 x i32> %c to <8 x i64> + %f = sub <8 x i64> %d, %e + ret <8 x i64> %f +} + +define <64 x i16> @vwsub_vx_v64i16(<64 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsub_vx_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vwsub.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <64 x i8>, <64 x i8>* %x + %b = insertelement <64 x i8> undef, i8 %y, i32 0 + %c = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %d = sext <64 x i8> %a to <64 x i16> + %e = sext <64 x i8> %c to <64 x i16> + %f = sub <64 x i16> %d, %e + ret <64 x i16> %f +} + +define <32 x i32> @vwsub_vx_v32i32(<32 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwsub_vx_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vwsub.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %x + %b = insertelement <32 x i16> undef, i16 %y, i32 0 + %c = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %d = sext <32 x i16> %a to <32 x i32> + %e = sext <32 x i16> %c to <32 x i32> + %f = sub <32 x i32> %d, %e + ret <32 x i32> %f +} + +define <16 x i64> @vwsub_vx_v16i64(<16 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwsub_vx_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vwsub.vx v8, v16, a1 +; CHECK-NEXT: ret + %a = load <16 x i32>, <16 x i32>* %x + %b = insertelement <16 x i32> undef, i32 %y, i64 0 + %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %d = sext <16 x i32> %a 
to <16 x i64> + %e = sext <16 x i32> %c to <16 x i64> + %f = sub <16 x i64> %d, %e + ret <16 x i64> %f +} + +define <8 x i16> @vwsub_vx_v8i16_i8(<8 x i8>* %x, i8* %y) { +; CHECK-LABEL: vwsub_vx_v8i16_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: lb a1, 0(a1) +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwsub.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load i8, i8* %y + %c = sext i8 %b to i16 + %d = insertelement <8 x i16> undef, i16 %c, i32 0 + %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer + %f = sext <8 x i8> %a to <8 x i16> + %g = sub <8 x i16> %e, %f + ret <8 x i16> %g +} + +define <8 x i16> @vwsub_vx_v8i16_i16(<8 x i8>* %x, i16* %y) { +; CHECK-LABEL: vwsub_vx_v8i16_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vlse16.v v10, (a1), zero +; CHECK-NEXT: vwsub.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load i16, i16* %y + %d = insertelement <8 x i16> undef, i16 %b, i32 0 + %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer + %f = sext <8 x i8> %a to <8 x i16> + %g = sub <8 x i16> %e, %f + ret <8 x i16> %g +} + +define <4 x i32> @vwsub_vx_v4i32_i8(<4 x i16>* %x, i8* %y) { +; CHECK-LABEL: vwsub_vx_v4i32_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: lb a1, 0(a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwsub.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i8, i8* %y + %c = sext i8 %b to i32 + %d = insertelement <4 x i32> undef, i32 %c, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = sext <4 x i16> %a to <4 x i32> + %g = sub <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <4 x i32> @vwsub_vx_v4i32_i16(<4 x i16>* %x, i16* %y) { +; CHECK-LABEL: vwsub_vx_v4i32_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: lh a1, 0(a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwsub.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i16, i16* %y + %c = sext i16 %b to i32 + %d = insertelement <4 x i32> undef, i32 %c, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = sext <4 x i16> %a to <4 x i32> + %g = sub <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <4 x i32> @vwsub_vx_v4i32_i32(<4 x i16>* %x, i32* %y) { +; CHECK-LABEL: vwsub_vx_v4i32_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vlse32.v v10, (a1), zero +; CHECK-NEXT: vwsub.wv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load i32, i32* %y + %d = insertelement <4 x i32> undef, i32 %b, i32 0 + %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer + %f = sext <4 x i16> %a to <4 x i32> + %g = sub <4 x i32> %e, %f + ret <4 x i32> %g +} + +define <2 x i64> @vwsub_vx_v2i64_i8(<2 x i32>* %x, i8* %y) nounwind { +; RV32-LABEL: vwsub_vx_v2i64_i8: +; RV32: # %bb.0: +; RV32-NEXT: 
addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lb a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwsub.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i64_i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: lb a1, 0(a1) +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64-NEXT: vwsub.wv v8, v10, v9 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i8, i8* %y + %c = sext i8 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = sub <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwsub_vx_v2i64_i16(<2 x i32>* %x, i16* %y) nounwind { +; RV32-LABEL: vwsub_vx_v2i64_i16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lh a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwsub.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i64_i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: lh a1, 0(a1) +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64-NEXT: vwsub.wv v8, v10, v9 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i16, i16* %y + %c = sext i16 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = sub <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwsub_vx_v2i64_i32(<2 x i32>* %x, i32* %y) nounwind { +; RV32-LABEL: vwsub_vx_v2i64_i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwsub.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i64_i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64-NEXT: vwsub.wv v8, v10, v9 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i32, i32* %y + %c = sext i32 %b to i64 + %d = insertelement <2 x i64> undef, i64 %c, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = sub <2 x i64> %e, %f + ret <2 x i64> %g +} + +define <2 x i64> @vwsub_vx_v2i64_i64(<2 x i32>* %x, i64* %y) nounwind { +; RV32-LABEL: vwsub_vx_v2i64_i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: lw a2, 4(a1) 
+; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: sw a2, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vwsub.wv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i64_i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vlse64.v v10, (a1), zero +; RV64-NEXT: vwsub.wv v8, v10, v9 +; RV64-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load i64, i64* %y + %d = insertelement <2 x i64> undef, i64 %b, i64 0 + %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer + %f = sext <2 x i32> %a to <2 x i64> + %g = sub <2 x i64> %e, %f + ret <2 x i64> %g +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -0,0 +1,881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +define <2 x i16> @vwsubu_v2i16(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwsubu_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = zext <2 x i8> %a to <2 x i16> + %d = zext <2 x i8> %b to <2 x i16> + %e = sub <2 x i16> %c, %d + ret <2 x i16> %e +} + +define <4 x i16> @vwsubu_v4i16(<4 x i8>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwsubu_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = zext <4 x i8> %a to <4 x i16> + %d = zext <4 x i8> %b to <4 x i16> + %e = sub <4 x i16> %c, %d + ret <4 x i16> %e +} + +define <2 x i32> @vwsubu_v2i32(<2 x i16>* %x, <2 x i16>* %y) { +; CHECK-LABEL: vwsubu_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = load <2 x i16>, <2 x i16>* %y + %c = zext <2 x i16> %a to <2 x i32> + %d = zext <2 x i16> %b to <2 x i32> + %e = sub <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <8 x i16> @vwsubu_v8i16(<8 x i8>* %x, <8 x i8>* %y) { +; CHECK-LABEL: vwsubu_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a1) +; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = load <8 x i8>, <8 x i8>* %y + %c = zext <8 x i8> %a to <8 x i16> + %d = zext <8 x i8> %b to <8 x i16> + %e = sub <8 x i16> %c, %d + ret <8 x i16> %e +} + +define <4 x i32> @vwsubu_v4i32(<4 x i16>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwsubu_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vwsubu.vv v8, 
v9, v10 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = zext <4 x i16> %a to <4 x i32> + %d = zext <4 x i16> %b to <4 x i32> + %e = sub <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <2 x i64> @vwsubu_v2i64(<2 x i32>* %x, <2 x i32>* %y) { +; CHECK-LABEL: vwsubu_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v10, (a1) +; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = load <2 x i32>, <2 x i32>* %y + %c = zext <2 x i32> %a to <2 x i64> + %d = zext <2 x i32> %b to <2 x i64> + %e = sub <2 x i64> %c, %d + ret <2 x i64> %e +} + +define <16 x i16> @vwsubu_v16i16(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: vwsubu_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v11, (a1) +; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = zext <16 x i8> %a to <16 x i16> + %d = zext <16 x i8> %b to <16 x i16> + %e = sub <16 x i16> %c, %d + ret <16 x i16> %e +} + +define <8 x i32> @vwsubu_v8i32(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: vwsubu_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v11, (a1) +; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = zext <8 x i16> %a to <8 x i32> + %d = zext <8 x i16> %b to <8 x i32> + %e = sub <8 x i32> %c, %d + ret <8 x i32> %e +} + +define <4 x i64> @vwsubu_v4i64(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: vwsubu_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle32.v v11, (a1) +; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = zext <4 x i32> %a to <4 x i64> + %d = zext <4 x i32> %b to <4 x i64> + %e = sub <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <32 x i16> @vwsubu_v32i16(<32 x i8>* %x, <32 x i8>* %y) { +; CHECK-LABEL: vwsubu_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = zext <32 x i8> %a to <32 x i16> + %d = zext <32 x i8> %b to <32 x i16> + %e = sub <32 x i16> %c, %d + ret <32 x i16> %e +} + +define <16 x i32> @vwsubu_v16i32(<16 x i16>* %x, <16 x i16>* %y) { +; CHECK-LABEL: vwsubu_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = zext <16 x i16> %a to <16 x i32> + %d = zext <16 x i16> %b to <16 x i32> + %e = sub <16 x i32> %c, %d + ret <16 x i32> %e +} + +define <8 x i64> @vwsubu_v8i64(<8 x i32>* %x, <8 x i32>* %y) { +; CHECK-LABEL: vwsubu_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vle32.v v14, (a1) +; CHECK-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x 
i32>* %y + %c = zext <8 x i32> %a to <8 x i64> + %d = zext <8 x i32> %b to <8 x i64> + %e = sub <8 x i64> %c, %d + ret <8 x i64> %e +} + +define <64 x i16> @vwsubu_v64i16(<64 x i8>* %x, <64 x i8>* %y) { +; CHECK-LABEL: vwsubu_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v20, (a1) +; CHECK-NEXT: vwsubu.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <64 x i8>, <64 x i8>* %x + %b = load <64 x i8>, <64 x i8>* %y + %c = zext <64 x i8> %a to <64 x i16> + %d = zext <64 x i8> %b to <64 x i16> + %e = sub <64 x i16> %c, %d + ret <64 x i16> %e +} + +define <32 x i32> @vwsubu_v32i32(<32 x i16>* %x, <32 x i16>* %y) { +; CHECK-LABEL: vwsubu_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v20, (a1) +; CHECK-NEXT: vwsubu.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %x + %b = load <32 x i16>, <32 x i16>* %y + %c = zext <32 x i16> %a to <32 x i32> + %d = zext <32 x i16> %b to <32 x i32> + %e = sub <32 x i32> %c, %d + ret <32 x i32> %e +} + +define <16 x i64> @vwsubu_v16i64(<16 x i32>* %x, <16 x i32>* %y) { +; CHECK-LABEL: vwsubu_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v20, (a1) +; CHECK-NEXT: vwsubu.vv v8, v16, v20 +; CHECK-NEXT: ret + %a = load <16 x i32>, <16 x i32>* %x + %b = load <16 x i32>, <16 x i32>* %y + %c = zext <16 x i32> %a to <16 x i64> + %d = zext <16 x i32> %b to <16 x i64> + %e = sub <16 x i64> %c, %d + ret <16 x i64> %e +} + +define <128 x i16> @vwsubu_v128i16(<128 x i8>* %x, <128 x i8>* %y) nounwind { +; CHECK-LABEL: vwsubu_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <128 x i8>, <128 x i8>* %x + %b = load <128 x i8>, <128 x i8>* %y + %c = zext <128 x i8> %a to <128 x i16> + %d = zext <128 x i8> %b to <128 x i16> + %e = sub <128 x i16> %c, %d + ret <128 x i16> %e +} + +define <64 x i32> @vwsubu_v64i32(<64 x i16>* %x, <64 x i16>* %y) nounwind { +; CHECK-LABEL: vwsubu_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli 
zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <64 x i16>, <64 x i16>* %x + %b = load <64 x i16>, <64 x i16>* %y + %c = zext <64 x i16> %a to <64 x i32> + %d = zext <64 x i16> %b to <64 x i32> + %e = sub <64 x i32> %c, %d + ret <64 x i32> %e +} + +define <32 x i64> @vwsubu_v32i64(<32 x i32>* %x, <32 x i32>* %y) nounwind { +; CHECK-LABEL: vwsubu_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = load <32 x i32>, <32 x i32>* %x + %b = load <32 x i32>, <32 x i32>* %y + %c = zext <32 x i32> %a to <32 x i64> + %d = zext <32 x i32> %b to <32 x i64> + %e = sub <32 x i64> %c, %d + ret <32 x i64> %e +} + +define <2 x i32> @vwsubu_v2i32_v2i8(<2 x i8>* %x, <2 x i8>* %y) { +; CHECK-LABEL: vwsubu_v2i32_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vwsubu.vv v8, v11, v10 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = load <2 x i8>, <2 x i8>* %y + %c = zext <2 x i8> %a to <2 x i32> + %d = zext <2 x i8> %b to <2 x i32> + %e = sub <2 x i32> %c, %d + ret <2 x i32> %e +} + +define <4 x i32> @vwsubu_v4i32_v4i8_v4i16(<4 x i8>* %x, <4 x i16>* %y) { +; CHECK-LABEL: vwsubu_v4i32_v4i8_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vwsubu.vv v8, v10, v9 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = load <4 x i16>, <4 x i16>* %y + %c = zext <4 x i8> %a to <4 x i32> + %d = zext <4 x i16> %b to <4 x i32> + %e = sub <4 x i32> %c, %d + ret <4 x i32> %e +} + +define <4 x i64> @vwsubu_v4i64_v4i32_v4i8(<4 x i32>* %x, <4 x i8>* %y) { +; CHECK-LABEL: vwsubu_v4i64_v4i32_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vzext.vf4 v11, v8 +; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i8>, <4 x i8>* %y + %c = zext <4 x i32> %a to <4 x i64> + %d = zext <4 x i8> %b to <4 x i64> + %e = sub <4 x i64> %c, %d + ret <4 x i64> %e +} + +define <2 x i16> @vwsubu_vx_v2i16(<2 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsubu_vx_v2i16: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i8>, <2 x i8>* %x + %b = insertelement <2 x i8> undef, i8 %y, i32 0 + %c = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer + %d = zext <2 x i8> %a to <2 x i16> + %e = zext <2 x i8> %c to <2 x i16> + %f = sub <2 x i16> %d, %e + ret <2 x i16> %f +} + +define <4 x i16> @vwsubu_vx_v4i16(<4 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsubu_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i8>, <4 x i8>* %x + %b = insertelement <4 x i8> undef, i8 %y, i32 0 + %c = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer + %d = zext <4 x i8> %a to <4 x i16> + %e = zext <4 x i8> %c to <4 x i16> + %f = sub <4 x i16> %d, %e + ret <4 x i16> %f +} + +define <2 x i32> @vwsubu_vx_v2i32(<2 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwsubu_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i16>, <2 x i16>* %x + %b = insertelement <2 x i16> undef, i16 %y, i32 0 + %c = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer + %d = zext <2 x i16> %a to <2 x i32> + %e = zext <2 x i16> %c to <2 x i32> + %f = sub <2 x i32> %d, %e + ret <2 x i32> %f +} + +define <8 x i16> @vwsubu_vx_v8i16(<8 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsubu_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <8 x i8>, <8 x i8>* %x + %b = insertelement <8 x i8> undef, i8 %y, i32 0 + %c = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer + %d = zext <8 x i8> %a to <8 x i16> + %e = zext <8 x i8> %c to <8 x i16> + %f = sub <8 x i16> %d, %e + ret <8 x i16> %f +} + +define <4 x i32> @vwsubu_vx_v4i32(<4 x i16>* %x, i16 %y) { +; CHECK-LABEL: vwsubu_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %x + %b = insertelement <4 x i16> undef, i16 %y, i32 0 + %c = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer + %d = zext <4 x i16> %a to <4 x i32> + %e = zext <4 x i16> %c to <4 x i32> + %f = sub <4 x i32> %d, %e + ret <4 x i32> %f +} + +define <2 x i64> @vwsubu_vx_v2i64(<2 x i32>* %x, i32 %y) { +; CHECK-LABEL: vwsubu_vx_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %x + %b = insertelement <2 x i32> undef, i32 %y, i64 0 + %c = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer + %d = zext <2 x i32> %a to <2 x i64> + %e = zext <2 x i32> %c to <2 x i64> + %f = sub <2 x i64> %d, %e + ret <2 x i64> %f +} + +define <16 x i16> @vwsubu_vx_v16i16(<16 x i8>* %x, i8 %y) { +; CHECK-LABEL: vwsubu_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vwsubu.vx v8, v10, a1 +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = insertelement <16 x i8> undef, i8 %y, i32 0 + %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + 
  %d = zext <16 x i8> %a to <16 x i16>
+  %e = zext <16 x i8> %c to <16 x i16>
+  %f = sub <16 x i16> %d, %e
+  ret <16 x i16> %f
+}
+
+define <8 x i32> @vwsubu_vx_v8i32(<8 x i16>* %x, i16 %y) {
+; CHECK-LABEL: vwsubu_vx_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu
+; CHECK-NEXT: vle16.v v10, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v10, a1
+; CHECK-NEXT: ret
+  %a = load <8 x i16>, <8 x i16>* %x
+  %b = insertelement <8 x i16> undef, i16 %y, i32 0
+  %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
+  %d = zext <8 x i16> %a to <8 x i32>
+  %e = zext <8 x i16> %c to <8 x i32>
+  %f = sub <8 x i32> %d, %e
+  ret <8 x i32> %f
+}
+
+define <4 x i64> @vwsubu_vx_v4i64(<4 x i32>* %x, i32 %y) {
+; CHECK-LABEL: vwsubu_vx_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v10, a1
+; CHECK-NEXT: ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = insertelement <4 x i32> undef, i32 %y, i64 0
+  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+  %d = zext <4 x i32> %a to <4 x i64>
+  %e = zext <4 x i32> %c to <4 x i64>
+  %f = sub <4 x i64> %d, %e
+  ret <4 x i64> %f
+}
+
+define <32 x i16> @vwsubu_vx_v32i16(<32 x i8>* %x, i8 %y) {
+; CHECK-LABEL: vwsubu_vx_v32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu
+; CHECK-NEXT: vle8.v v12, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v12, a1
+; CHECK-NEXT: ret
+  %a = load <32 x i8>, <32 x i8>* %x
+  %b = insertelement <32 x i8> undef, i8 %y, i32 0
+  %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+  %d = zext <32 x i8> %a to <32 x i16>
+  %e = zext <32 x i8> %c to <32 x i16>
+  %f = sub <32 x i16> %d, %e
+  ret <32 x i16> %f
+}
+
+define <16 x i32> @vwsubu_vx_v16i32(<16 x i16>* %x, i16 %y) {
+; CHECK-LABEL: vwsubu_vx_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v12, a1
+; CHECK-NEXT: ret
+  %a = load <16 x i16>, <16 x i16>* %x
+  %b = insertelement <16 x i16> undef, i16 %y, i32 0
+  %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
+  %d = zext <16 x i16> %a to <16 x i32>
+  %e = zext <16 x i16> %c to <16 x i32>
+  %f = sub <16 x i32> %d, %e
+  ret <16 x i32> %f
+}
+
+define <8 x i64> @vwsubu_vx_v8i64(<8 x i32>* %x, i32 %y) {
+; CHECK-LABEL: vwsubu_vx_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; CHECK-NEXT: vle32.v v12, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v12, a1
+; CHECK-NEXT: ret
+  %a = load <8 x i32>, <8 x i32>* %x
+  %b = insertelement <8 x i32> undef, i32 %y, i64 0
+  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+  %d = zext <8 x i32> %a to <8 x i64>
+  %e = zext <8 x i32> %c to <8 x i64>
+  %f = sub <8 x i64> %d, %e
+  ret <8 x i64> %f
+}
+
+define <64 x i16> @vwsubu_vx_v64i16(<64 x i8>* %x, i8 %y) {
+; CHECK-LABEL: vwsubu_vx_v64i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v16, a1
+; CHECK-NEXT: ret
+  %a = load <64 x i8>, <64 x i8>* %x
+  %b = insertelement <64 x i8> undef, i8 %y, i32 0
+  %c = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
+  %d = zext <64 x i8> %a to <64 x i16>
+  %e = zext <64 x i8> %c to <64 x i16>
+  %f = sub <64 x i16> %d, %e
+  ret <64 x i16> %f
+}
+
+define <32 x i32> @vwsubu_vx_v32i32(<32 x i16>* %x, i16 %y) {
+; CHECK-LABEL: vwsubu_vx_v32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v16, a1
+; CHECK-NEXT: ret
+  %a = load <32 x i16>, <32 x i16>* %x
+  %b = insertelement <32 x i16> undef, i16 %y, i32 0
+  %c = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
+  %d = zext <32 x i16> %a to <32 x i32>
+  %e = zext <32 x i16> %c to <32 x i32>
+  %f = sub <32 x i32> %d, %e
+  ret <32 x i32> %f
+}
+
+define <16 x i64> @vwsubu_vx_v16i64(<16 x i32>* %x, i32 %y) {
+; CHECK-LABEL: vwsubu_vx_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vwsubu.vx v8, v16, a1
+; CHECK-NEXT: ret
+  %a = load <16 x i32>, <16 x i32>* %x
+  %b = insertelement <16 x i32> undef, i32 %y, i64 0
+  %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
+  %d = zext <16 x i32> %a to <16 x i64>
+  %e = zext <16 x i32> %c to <16 x i64>
+  %f = sub <16 x i64> %d, %e
+  ret <16 x i64> %f
+}
+
+define <8 x i16> @vwsubu_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwsubu_vx_v8i16_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: lbu a1, 0(a1)
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vwsubu.wv v8, v10, v9
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i16
+  %d = insertelement <8 x i16> undef, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = zext <8 x i8> %a to <8 x i16>
+  %g = sub <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <8 x i16> @vwsubu_vx_v8i16_i16(<8 x i8>* %x, i16* %y) {
+; CHECK-LABEL: vwsubu_vx_v8i16_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vlse16.v v10, (a1), zero
+; CHECK-NEXT: vwsubu.wv v8, v10, v9
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i16, i16* %y
+  %d = insertelement <8 x i16> undef, i16 %b, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = zext <8 x i8> %a to <8 x i16>
+  %g = sub <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <4 x i32> @vwsubu_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwsubu_vx_v4i32_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: lbu a1, 0(a1)
+; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vwsubu.wv v8, v10, v9
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = zext <4 x i16> %a to <4 x i32>
+  %g = sub <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwsubu_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwsubu_vx_v4i32_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: lhu a1, 0(a1)
+; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vwsubu.wv v8, v10, v9
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = zext <4 x i16> %a to <4 x i32>
+  %g = sub <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwsubu_vx_v4i32_i32(<4 x i16>* %x, i32* %y) {
+; CHECK-LABEL: vwsubu_vx_v4i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vlse32.v v10, (a1), zero
+; CHECK-NEXT: vwsubu.wv v8, v10, v9
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i32, i32* %y
+  %d = insertelement <4 x i32> undef, i32 %b, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = zext <4 x i16> %a to <4 x i32>
+  %g = sub <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <2 x i64> @vwsubu_vx_v2i64_i8(<2 x i32>* %x, i8* %y) nounwind {
+; RV32-LABEL: vwsubu_vx_v2i64_i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lbu a1, 0(a1)
+; RV32-NEXT: vle32.v v9, (a0)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vwsubu.wv v8, v10, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwsubu_vx_v2i64_i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: lbu a1, 0(a1)
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64-NEXT: vwsubu.wv v8, v10, v9
+; RV64-NEXT: ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = sub <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwsubu_vx_v2i64_i16(<2 x i32>* %x, i16* %y) nounwind {
+; RV32-LABEL: vwsubu_vx_v2i64_i16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lhu a1, 0(a1)
+; RV32-NEXT: vle32.v v9, (a0)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vwsubu.wv v8, v10, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwsubu_vx_v2i64_i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: lhu a1, 0(a1)
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64-NEXT: vwsubu.wv v8, v10, v9
+; RV64-NEXT: ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = sub <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwsubu_vx_v2i64_i32(<2 x i32>* %x, i32* %y) nounwind {
+; RV32-LABEL: vwsubu_vx_v2i64_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: vle32.v v9, (a0)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vwsubu.wv v8, v10, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwsubu_vx_v2i64_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: lwu a1, 0(a1)
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v10, a1
+; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV64-NEXT: vwsubu.wv v8, v10, v9
+; RV64-NEXT: ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i32, i32* %y
+  %c = zext i32 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = sub <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwsubu_vx_v2i64_i64(<2 x i32>* %x, i64* %y) nounwind {
+; RV32-LABEL: vwsubu_vx_v2i64_i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: vle32.v v9, (a0)
+; RV32-NEXT: sw a2, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vwsubu.wv v8, v10, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwsubu_vx_v2i64_i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: vlse64.v v10, (a1), zero
+; RV64-NEXT: vwsubu.wv v8, v10, v9
+; RV64-NEXT: ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i64, i64* %y
+  %d = insertelement <2 x i64> undef, i64 %b, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = sub <2 x i64> %e, %f
+  ret <2 x i64> %g
+}