diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -286,6 +286,12 @@
   VFWMUL_VL,
+  // Widening ternary operations with a mask as the fourth operand and VL as
+  // the fifth operand.
+  VWMACC_VL,
+  VWMACCU_VL,
+  VWMACCSU_VL,
+
   // Narrowing logical shift right.
   // Operands are (source, shift, passthru, mask, vl)
   VNSRL_VL,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -11921,6 +11921,49 @@
   return convertFromScalableVector(VT, Res, DAG, Subtarget);
 }

+static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
+                               const RISCVSubtarget &Subtarget) {
+  assert(N->getOpcode() == RISCVISD::ADD_VL);
+  SDValue Addend = N->getOperand(0);
+  SDValue MulOp = N->getOperand(1);
+  SDValue AddMergeOp = N->getOperand(2);
+
+  if (!AddMergeOp.isUndef())
+    return SDValue();
+
+  auto IsVWMulOpc = [](unsigned Opc) {
+    switch (Opc) {
+    case RISCVISD::VWMUL_VL:
+    case RISCVISD::VWMULU_VL:
+    case RISCVISD::VWMULSU_VL:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  if (!IsVWMulOpc(MulOp.getOpcode()))
+    std::swap(Addend, MulOp);
+
+  if (!IsVWMulOpc(MulOp.getOpcode()))
+    return SDValue();
+
+  SDValue AddMask = N->getOperand(3);
+  SDValue AddVL = N->getOperand(4);
+  SDValue MulMask = MulOp.getOperand(3);
+  SDValue MulVL = MulOp.getOperand(4);
+
+  if (AddMask != MulMask || AddVL != MulVL)
+    return SDValue();
+
+  unsigned Opc = RISCVISD::VWMACC_VL + MulOp.getOpcode() - RISCVISD::VWMUL_VL;
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Ops[] = {MulOp.getOperand(0), MulOp.getOperand(1), Addend, AddMask,
+                   AddVL};
+  return DAG.getNode(Opc, DL, VT, Ops);
+}
+
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -12331,6 +12374,10 @@
     break;
   }
   case RISCVISD::ADD_VL:
+    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI))
+      return V;
+    else
+      return combineToVWMACC(N, DAG, Subtarget);
   case RISCVISD::SUB_VL:
   case RISCVISD::VWADD_W_VL:
   case RISCVISD::VWADDU_W_VL:
@@ -15460,6 +15507,9 @@
     NODE_NAME_CASE(VWSUB_W_VL)
     NODE_NAME_CASE(VWSUBU_W_VL)
     NODE_NAME_CASE(VFWMUL_VL)
+    NODE_NAME_CASE(VWMACC_VL)
+    NODE_NAME_CASE(VWMACCU_VL)
+    NODE_NAME_CASE(VWMACCSU_VL)
     NODE_NAME_CASE(VNSRL_VL)
     NODE_NAME_CASE(SETCC_VL)
     NODE_NAME_CASE(VSELECT_VL)
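Note (illustrative sketch, not part of the patch): combineToVWMACC picks the multiply-accumulate opcode by enum offset, RISCVISD::VWMACC_VL + MulOpc - RISCVISD::VWMUL_VL, so it silently relies on the three new VWMACC*_VL nodes being declared in the same relative order as VWMUL_VL/VWMULU_VL/VWMULSU_VL. A hypothetical compile-time check of that assumption:

// Sketch only: the offset arithmetic in combineToVWMACC is valid only if the
// VWMACC*_VL group mirrors the VWMUL*_VL declaration order exactly.
static_assert(RISCVISD::VWMACCU_VL - RISCVISD::VWMACC_VL ==
                      RISCVISD::VWMULU_VL - RISCVISD::VWMUL_VL &&
              RISCVISD::VWMACCSU_VL - RISCVISD::VWMACC_VL ==
                      RISCVISD::VWMULSU_VL - RISCVISD::VWMUL_VL,
              "VWMACC*_VL must be declared in the same order as VWMUL*_VL");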
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -390,6 +390,19 @@
 def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWIntBinOp_VL, []>;
 def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWIntBinOp_VL, []>;

+def SDT_RISCVVWIntTernOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisInt<0>,
+                                                   SDTCisInt<1>,
+                                                   SDTCisSameNumEltsAs<0, 1>,
+                                                   SDTCisOpSmallerThanOp<1, 0>,
+                                                   SDTCisSameAs<1, 2>,
+                                                   SDTCisSameAs<0, 3>,
+                                                   SDTCisSameNumEltsAs<1, 4>,
+                                                   SDTCVecEltisVT<4, i1>,
+                                                   SDTCisVT<5, XLenVT>]>;
+def riscv_vwmacc_vl : SDNode<"RISCVISD::VWMACC_VL", SDT_RISCVVWIntTernOp_VL, [SDNPCommutative]>;
+def riscv_vwmaccu_vl : SDNode<"RISCVISD::VWMACCU_VL", SDT_RISCVVWIntTernOp_VL, [SDNPCommutative]>;
+def riscv_vwmaccsu_vl : SDNode<"RISCVISD::VWMACCSU_VL", SDT_RISCVVWIntTernOp_VL, [SDNPCommutative]>;
+
 def SDT_RISCVVWFPBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisFP<0>,
                                                  SDTCisFP<1>,
                                                  SDTCisSameNumEltsAs<0, 1>,
@@ -1532,6 +1545,31 @@
   }
 }

+multiclass VPatWidenMultiplyAddVL_VV_VX2<SDNode vwmacc_op, string instr_name> {
+  foreach vtiTowti = AllWidenableIntVectors in {
+    defvar vti = vtiTowti.Vti;
+    defvar wti = vtiTowti.Wti;
+    let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+                                 GetVTypePredicates<wti>.Predicates) in {
+      def : Pat<(vwmacc_op (vti.Vector vti.RegClass:$rs1),
+                           (vti.Vector vti.RegClass:$rs2),
+                           (wti.Vector wti.RegClass:$rd),
+                           vti.Mask:$mask, VLOpFrag),
+                (!cast<Instruction>(instr_name#"_VV_"#vti.LMul.MX#"_MASK")
+                     wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
+                     vti.Mask:$mask, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+      def : Pat<(vwmacc_op (SplatPat XLenVT:$rs1),
+                           (vti.Vector vti.RegClass:$rs2),
+                           (wti.Vector wti.RegClass:$rd),
+                           vti.Mask:$mask, VLOpFrag),
+                (!cast<Instruction>(instr_name#"_VX_"#vti.LMul.MX#"_MASK")
+                     wti.RegClass:$rd, vti.ScalarRegClass:$rs1,
+                     vti.RegClass:$rs2, vti.Mask:$mask, GPR:$vl, vti.Log2SEW,
+                     TAIL_AGNOSTIC)>;
+    }
+  }
+}
+
 multiclass VPatWidenMultiplyAddVL_VV_VX {
   foreach vtiTowti = AllWidenableIntVectors in {
     defvar vti = vtiTowti.Vti;
     defvar wti = vtiTowti.Wti;
@@ -1829,25 +1867,22 @@
 defm : VPatMultiplyAccVL_VV_VX;

 // 11.14. Vector Widening Integer Multiply-Add Instructions
-defm : VPatWidenMultiplyAddVL_VV_VX;
-defm : VPatWidenMultiplyAddVL_VV_VX;
+defm : VPatWidenMultiplyAddVL_VV_VX2<riscv_vwmacc_vl, "PseudoVWMACC">;
+defm : VPatWidenMultiplyAddVL_VV_VX2<riscv_vwmaccu_vl, "PseudoVWMACCU">;
+defm : VPatWidenMultiplyAddVL_VV_VX2<riscv_vwmaccsu_vl, "PseudoVWMACCSU">;
 defm : VPatWidenMultiplyAddVL_VV_VX;
 foreach vtiTowti = AllWidenableIntVectors in {
   defvar vti = vtiTowti.Vti;
   defvar wti = vtiTowti.Wti;
   let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
                                GetVTypePredicates<wti>.Predicates) in
-  def : Pat<(wti.Vector
-             (riscv_add_vl wti.RegClass:$rd,
-                           (riscv_vwmulsu_vl_oneuse (vti.Vector vti.RegClass:$rs1),
-                                                    (SplatPat XLenVT:$rs2),
-                                                    srcvalue,
-                                                    (vti.Mask true_mask),
-                                                    VLOpFrag),
-                           srcvalue, (vti.Mask true_mask), VLOpFrag)),
-            (!cast<Instruction>("PseudoVWMACCUS_VX_" # vti.LMul.MX)
-                wti.RegClass:$rd, vti.ScalarRegClass:$rs2, vti.RegClass:$rs1,
-                GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+  def : Pat<(riscv_vwmaccsu_vl (vti.Vector vti.RegClass:$rs1),
+                               (SplatPat XLenVT:$rs2),
+                               (wti.Vector wti.RegClass:$rd),
+                               vti.Mask:$mask, VLOpFrag),
+            (!cast<Instruction>("PseudoVWMACCUS_VX_"#vti.LMul.MX#"_MASK")
+                wti.RegClass:$rd, vti.ScalarRegClass:$rs2, vti.RegClass:$rs1,
+                vti.Mask:$mask, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
 }
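Note (illustrative sketch, not part of the patch): the source-level shape this combine targets, written as a standalone variant of the vwmacc tests updated below (the function name and vector shapes are arbitrary). Compiled with llc -mtriple=riscv64 -mattr=+v, the add of a widening multiply now selects the masked pseudo, i.e. vmset.m v0 followed by vwmacc.vv ..., v0.t, instead of the unmasked instruction:

; A sign-extended multiply feeding an add: the new DAG combine rewrites
; ADD_VL(VWMUL_VL(a, b), acc) to VWMACC_VL, which isel lowers to the
; masked pseudo with an all-ones mask.
define <4 x i32> @vwmacc_sketch(<4 x i16> %a, <4 x i16> %b, <4 x i32> %acc) {
  %va = sext <4 x i16> %a to <4 x i32>
  %vb = sext <4 x i16> %b to <4 x i32>
  %mul = mul <4 x i32> %va, %vb
  %add = add <4 x i32> %mul, %acc
  ret <4 x i32> %add
}

 // 11.15.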
Vector Integer Merge Instructions diff --git a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll --- a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll +++ b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll @@ -7,10 +7,11 @@ define dso_local <16 x i16> @interleave(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: interleave: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -9,10 +9,11 @@ define <4 x half> @interleave_v2f16(<2 x half> %x, <2 x half> %y) { ; CHECK-LABEL: interleave_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %a = shufflevector <2 x half> %x, <2 x half> %y, <4 x i32> @@ -23,10 +24,11 @@ define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: interleave_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v9, v8 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v8 +; CHECK-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %a = shufflevector <2 x float> %x, <2 x float> %y, <4 x i32> @@ -95,19 +97,21 @@ define <8 x half> @interleave_v4f16(<4 x half> %x, <4 x half> %y) { ; V128-LABEL: interleave_v4f16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V128-NEXT: vmv1r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4f16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> @@ -117,19 +121,21 @@ define <8 x float> @interleave_v4f32(<4 x float> %x, <4 x float> %y) { ; V128-LABEL: interleave_v4f32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V128-NEXT: vmv2r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4f32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m 
v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> @@ -140,19 +146,21 @@ define <16 x half> @interleave_v8f16(<8 x half> %x, <8 x half> %y) { ; V128-LABEL: interleave_v8f16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; V128-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; V128-NEXT: vwaddu.vv v10, v9, v8 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; V128-NEXT: vmv2r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v8f16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, mu ; V512-NEXT: vwaddu.vv v10, v9, v8 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v8 +; V512-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <8 x half> %x, <8 x half> %y, <16 x i32> @@ -162,19 +170,21 @@ define <16 x float> @interleave_v8f32(<8 x float> %x, <8 x float> %y) { ; V128-LABEL: interleave_v8f32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; V128-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; V128-NEXT: vmv4r.v v8, v12 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v8f32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <8 x float> %x, <8 x float> %y, <16 x i32> @@ -184,19 +194,21 @@ define <32 x half> @interleave_v16f16(<16 x half> %x, <16 x half> %y) { ; V128-LABEL: interleave_v16f16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; V128-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; V128-NEXT: vmv4r.v v8, v12 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v16f16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e16, mf2, ta, ma +; V512-NEXT: vsetivli zero, 16, e16, mf2, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <16 x half> %x, <16 x half> %y, <32 x i32> @@ -206,19 +218,21 @@ define <32 x float> @interleave_v16f32(<16 x float> %x, <16 x float> %y) { ; V128-LABEL: interleave_v16f32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: vwmaccu.vx v16, a0, v12, v0.t ; V128-NEXT: vmv8r.v v8, v16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v16f32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e32, m1, ta, ma +; V512-NEXT: vsetivli zero, 16, e32, m1, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; 
V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv2r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <16 x float> %x, <16 x float> %y, <32 x i32> @@ -229,20 +243,22 @@ ; V128-LABEL: interleave_v32f16: ; V128: # %bb.0: ; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; V128-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: vwmaccu.vx v16, a0, v12, v0.t ; V128-NEXT: vmv8r.v v8, v16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32f16: ; V512: # %bb.0: ; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; V512-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv2r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <32 x half> %x, <32 x half> %y, <64 x i32> @@ -264,44 +280,50 @@ ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) ; RV32-V128-NEXT: li a1, 32 ; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: vle32.v v24, (a0) +; RV32-V128-NEXT: vle32.v v8, (a0) ; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) ; RV32-V128-NEXT: vle32.v v16, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, 699051 ; RV32-V128-NEXT: addi a0, a0, -1366 ; RV32-V128-NEXT: vmv.s.x v0, a0 -; RV32-V128-NEXT: vrgather.vv v16, v8, v24 -; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vrgather.vv v24, v16, v8 ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v16, v8, v24, v0.t -; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-V128-NEXT: vmv4r.v v24, v8 ; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vwaddu.vv v0, v8, v24 +; RV32-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vrgather.vv v24, v8, v16, v0.t +; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vwaddu.vv v16, v0, v8 +; RV32-V128-NEXT: vmset.m v0 ; RV32-V128-NEXT: li a0, -1 -; RV32-V128-NEXT: vwmaccu.vx v0, a0, v24 -; RV32-V128-NEXT: vmv8r.v v8, v0 +; RV32-V128-NEXT: vwmaccu.vx v16, a0, 
v8, v0.t +; RV32-V128-NEXT: vmv8r.v v8, v16 +; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: li a1, 24 ; RV32-V128-NEXT: mul a0, a0, a1 @@ -323,44 +345,50 @@ ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) ; RV64-V128-NEXT: li a1, 32 ; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-V128-NEXT: vle32.v v24, (a0) +; RV64-V128-NEXT: vle32.v v8, (a0) ; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) ; RV64-V128-NEXT: vle32.v v16, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, 699051 ; RV64-V128-NEXT: addiw a0, a0, -1366 ; RV64-V128-NEXT: vmv.s.x v0, a0 -; RV64-V128-NEXT: vrgather.vv v16, v8, v24 -; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: slli a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vrgather.vv v24, v16, v8 ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v16, v8, v24, v0.t -; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-V128-NEXT: vmv4r.v v24, v8 ; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vwaddu.vv v0, v8, v24 +; RV64-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vrgather.vv v24, v8, v16, v0.t +; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vwaddu.vv v16, v0, v8 +; RV64-V128-NEXT: vmset.m v0 ; RV64-V128-NEXT: li a0, -1 -; RV64-V128-NEXT: vwmaccu.vx v0, a0, v24 -; RV64-V128-NEXT: vmv8r.v v8, v0 +; RV64-V128-NEXT: vwmaccu.vx v16, a0, v8, v0.t +; RV64-V128-NEXT: vmv8r.v v8, v16 +; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: li a1, 24 ; RV64-V128-NEXT: mul a0, a0, a1 @@ -371,10 +399,11 @@ ; V512-LABEL: interleave_v32f32: ; V512: # %bb.0: ; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; V512-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; V512-NEXT: vwaddu.vv v12, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v12, a0, v10 +; V512-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; V512-NEXT: vmv4r.v v8, v12 ; V512-NEXT: ret %a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32> @@ -386,20 +415,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; 
V128-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v4f16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 2 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32> @@ -411,20 +442,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v4f32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 2 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> @@ -481,20 +514,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 4 -; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v8f16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 4 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <8 x half> %x, <8 x half> poison, <8 x i32> @@ -506,20 +541,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; V128-NEXT: vslidedown.vi v12, v8, 4 -; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; V128-NEXT: vwaddu.vv v10, v12, v8 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; V128-NEXT: vmv2r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v8f32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 4 ; V512-NEXT: vwaddu.vv v9, v10, v8 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v8 +; V512-NEXT: vwmaccu.vx v9, a0, v8, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -9,10 +9,11 @@ define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: interleave_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> @@ -22,10 +23,11 @@ define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: interleave_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> @@ -36,10 +38,11 @@ define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: interleave_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v9, v8 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v8 +; CHECK-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> @@ -108,19 +111,21 @@ define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) { ; V128-LABEL: interleave_v4i8: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; V128-NEXT: vwaddu.vv v10, v9, v8 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; V128-NEXT: vmv1r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, mu ; V512-NEXT: vwaddu.vv v10, v9, v8 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v8 +; V512-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> @@ -131,19 +136,21 @@ define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) { ; V128-LABEL: interleave_v4i16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V128-NEXT: vmv1r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> @@ -153,19 +160,21 @@ define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) { ; V128-LABEL: interleave_v4i32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; V128-NEXT: vsetivli 
zero, 4, e32, m1, ta, mu ; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V128-NEXT: vmv2r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> @@ -178,20 +187,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v9, 2 -; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32_offset_2: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V512-NEXT: vslidedown.vi v10, v9, 2 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> @@ -204,20 +215,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v9, 1 -; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32_offset_1: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V512-NEXT: vslidedown.vi v10, v9, 1 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> @@ -227,19 +240,21 @@ define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) { ; V128-LABEL: interleave_v8i8: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; V128-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V128-NEXT: vmv1r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v8i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, ma +; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> @@ -250,19 +265,21 @@ define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) { ; V128-LABEL: interleave_v8i16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; V128-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; V128-NEXT: vwaddu.vv v10, v9, v8 +; 
V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v8 +; V128-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; V128-NEXT: vmv2r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v8i16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, mu ; V512-NEXT: vwaddu.vv v10, v9, v8 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v8 +; V512-NEXT: vwmaccu.vx v10, a0, v8, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> @@ -272,19 +289,21 @@ define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) { ; V128-LABEL: interleave_v8i32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; V128-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; V128-NEXT: vmv4r.v v8, v12 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v8i32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> @@ -294,19 +313,21 @@ define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) { ; V128-LABEL: interleave_v16i8: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; V128-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; V128-NEXT: vwaddu.vv v10, v8, v9 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v9 +; V128-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V128-NEXT: vmv2r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v16i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e8, mf4, ta, ma +; V512-NEXT: vsetivli zero, 16, e8, mf4, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> @@ -316,19 +337,21 @@ define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) { ; V128-LABEL: interleave_v16i16: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; V128-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; V128-NEXT: vmv4r.v v8, v12 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v16i16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e16, mf2, ta, ma +; V512-NEXT: vsetivli zero, 16, e16, mf2, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> @@ -338,19 +361,21 @@ define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) { ; V128-LABEL: interleave_v16i32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: 
vwmaccu.vx v16, a0, v12, v0.t ; V128-NEXT: vmv8r.v v8, v16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v16i32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 16, e32, m1, ta, ma +; V512-NEXT: vsetivli zero, 16, e32, m1, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv2r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> @@ -361,20 +386,22 @@ ; V128-LABEL: interleave_v32i8: ; V128: # %bb.0: ; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; V128-NEXT: vsetvli zero, a0, e8, m2, ta, mu ; V128-NEXT: vwaddu.vv v12, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v12, a0, v10 +; V128-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; V128-NEXT: vmv4r.v v8, v12 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i8: ; V512: # %bb.0: ; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; V512-NEXT: vsetvli zero, a0, e8, mf2, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> @@ -385,20 +412,22 @@ ; V128-LABEL: interleave_v32i16: ; V128: # %bb.0: ; V128-NEXT: li a0, 32 -; V128-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; V128-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; V128-NEXT: vwaddu.vv v16, v8, v12 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v16, a0, v12 +; V128-NEXT: vwmaccu.vx v16, a0, v12, v0.t ; V128-NEXT: vmv8r.v v8, v16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i16: ; V512: # %bb.0: ; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; V512-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; V512-NEXT: vwaddu.vv v10, v8, v9 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v9 +; V512-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; V512-NEXT: vmv2r.v v8, v10 ; V512-NEXT: ret %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> @@ -420,44 +449,50 @@ ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, %hi(.LCPI17_0) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) ; RV32-V128-NEXT: li a1, 32 ; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-V128-NEXT: vle32.v v24, (a0) +; RV32-V128-NEXT: vle32.v v8, (a0) ; RV32-V128-NEXT: lui a0, %hi(.LCPI17_1) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) ; RV32-V128-NEXT: vle32.v v16, (a0) -; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 -; RV32-V128-NEXT: add a0, sp, a0 -; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, 699051 ; RV32-V128-NEXT: addi a0, a0, -1366 ; RV32-V128-NEXT: vmv.s.x v0, a0 -; RV32-V128-NEXT: vrgather.vv v16, v8, v24 -; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 -; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size 
Folded Reload +; RV32-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vrgather.vv v24, v16, v8 ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: slli a0, a0, 3 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v16, v8, v24, v0.t -; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-V128-NEXT: vmv4r.v v24, v8 ; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vwaddu.vv v0, v8, v24 +; RV32-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vrgather.vv v24, v8, v16, v0.t +; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vwaddu.vv v16, v0, v8 +; RV32-V128-NEXT: vmset.m v0 ; RV32-V128-NEXT: li a0, -1 -; RV32-V128-NEXT: vwmaccu.vx v0, a0, v24 -; RV32-V128-NEXT: vmv8r.v v8, v0 +; RV32-V128-NEXT: vwmaccu.vx v16, a0, v8, v0.t +; RV32-V128-NEXT: vmv8r.v v8, v16 +; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: li a1, 24 ; RV32-V128-NEXT: mul a0, a0, a1 @@ -479,44 +514,50 @@ ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, %hi(.LCPI17_0) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) ; RV64-V128-NEXT: li a1, 32 ; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-V128-NEXT: vle32.v v24, (a0) +; RV64-V128-NEXT: vle32.v v8, (a0) ; RV64-V128-NEXT: lui a0, %hi(.LCPI17_1) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_1) ; RV64-V128-NEXT: vle32.v v16, (a0) -; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 -; RV64-V128-NEXT: add a0, sp, a0 -; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, 699051 ; RV64-V128-NEXT: addiw a0, a0, -1366 ; RV64-V128-NEXT: vmv.s.x v0, a0 -; RV64-V128-NEXT: vrgather.vv v16, v8, v24 -; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: slli a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 -; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vrgather.vv v24, v16, v8 ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: slli a0, a0, 3 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v16, v8, v24, v0.t -; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-V128-NEXT: vmv4r.v v24, v8 ; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vwaddu.vv v0, v8, v24 +; RV64-V128-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vrgather.vv v24, v8, v16, v0.t +; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: 
slli a0, a0, 4 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vwaddu.vv v16, v0, v8 +; RV64-V128-NEXT: vmset.m v0 ; RV64-V128-NEXT: li a0, -1 -; RV64-V128-NEXT: vwmaccu.vx v0, a0, v24 -; RV64-V128-NEXT: vmv8r.v v8, v0 +; RV64-V128-NEXT: vwmaccu.vx v16, a0, v8, v0.t +; RV64-V128-NEXT: vmv8r.v v8, v16 +; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: li a1, 24 ; RV64-V128-NEXT: mul a0, a0, a1 @@ -527,10 +568,11 @@ ; V512-LABEL: interleave_v32i32: ; V512: # %bb.0: ; V512-NEXT: li a0, 32 -; V512-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; V512-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; V512-NEXT: vwaddu.vv v12, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v12, a0, v10 +; V512-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; V512-NEXT: vmv4r.v v8, v12 ; V512-NEXT: ret %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> @@ -542,20 +584,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; V128-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v4i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 2 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> @@ -592,20 +636,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v4i16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 2 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> @@ -617,20 +663,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 2 -; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v4i32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 2 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, 
v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> @@ -687,20 +735,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 4 -; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; V128-NEXT: vwaddu.vv v9, v8, v10 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v10 +; V128-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v8i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 4 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> @@ -712,20 +762,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 4 -; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; V128-NEXT: vwaddu.vv v9, v10, v8 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v8 +; V128-NEXT: vwmaccu.vx v9, a0, v8, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v8i16: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 4 ; V512-NEXT: vwaddu.vv v9, v10, v8 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v8 +; V512-NEXT: vwmaccu.vx v9, a0, v8, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> @@ -737,20 +789,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; V128-NEXT: vslidedown.vi v12, v8, 4 -; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; V128-NEXT: vwaddu.vv v10, v8, v12 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v12 +; V128-NEXT: vwmaccu.vx v10, a0, v12, v0.t ; V128-NEXT: vmv2r.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_v8i32: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 4 ; V512-NEXT: vwaddu.vv v9, v8, v10 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v10 +; V512-NEXT: vwmaccu.vx v9, a0, v10, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> @@ -764,20 +818,22 @@ ; V128: # %bb.0: ; V128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; V128-NEXT: vslidedown.vi v10, v8, 1 -; V128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; V128-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; V128-NEXT: vwaddu.vv v9, v10, v8 +; V128-NEXT: vmset.m v0 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v9, a0, v8 +; V128-NEXT: vwmaccu.vx v9, a0, v8, v0.t ; V128-NEXT: vmv1r.v v8, v9 ; V128-NEXT: ret ; ; V512-LABEL: unary_interleave_10uu_v4i8: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; V512-NEXT: vslidedown.vi v10, v8, 1 ; V512-NEXT: vwaddu.vv v9, v10, v8 +; V512-NEXT: vmset.m v0 ; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v9, a0, v8 +; V512-NEXT: vwmaccu.vx v9, 
a0, v8, v0.t ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -320,10 +320,11 @@ ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vrgather.vi v9, v8, 0 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v8, a0, v10 +; CHECK-NEXT: vwmaccu.vx v8, a0, v10, v0.t ; CHECK-NEXT: ret %y = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> %z = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll @@ -5,10 +5,11 @@ define <2 x i16> @vwmacc_v2i16(ptr %x, ptr %y, <2 x i16> %z) { ; CHECK-LABEL: vwmacc_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -22,10 +23,11 @@ define <4 x i16> @vwmacc_v4i16(ptr %x, ptr %y, <4 x i16> %z) { ; CHECK-LABEL: vwmacc_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -39,10 +41,11 @@ define <2 x i32> @vwmacc_v2i32(ptr %x, ptr %y, <2 x i32> %z) { ; CHECK-LABEL: vwmacc_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -56,10 +59,11 @@ define <8 x i16> @vwmacc_v8i16(ptr %x, ptr %y, <8 x i16> %z) { ; CHECK-LABEL: vwmacc_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -73,10 +77,11 @@ define <4 x i32> @vwmacc_v4i32(ptr %x, ptr %y, <4 x i32> %z) { ; CHECK-LABEL: vwmacc_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -90,10 +95,11 @@ define <2 x i64> 
@vwmacc_v2i64(ptr %x, ptr %y, <2 x i64> %z) { ; CHECK-LABEL: vwmacc_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -107,10 +113,11 @@ define <16 x i16> @vwmacc_v16i16(ptr %x, ptr %y, <16 x i16> %z) { ; CHECK-LABEL: vwmacc_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmacc.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -124,10 +131,11 @@ define <8 x i32> @vwmacc_v8i32(ptr %x, ptr %y, <8 x i32> %z) { ; CHECK-LABEL: vwmacc_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmacc.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -141,10 +149,11 @@ define <4 x i64> @vwmacc_v4i64(ptr %x, ptr %y, <4 x i64> %z) { ; CHECK-LABEL: vwmacc_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmacc.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -159,10 +168,11 @@ ; CHECK-LABEL: vwmacc_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmacc.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -176,10 +186,11 @@ define <16 x i32> @vwmacc_v16i32(ptr %x, ptr %y, <16 x i32> %z) { ; CHECK-LABEL: vwmacc_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmacc.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -193,10 +204,11 @@ define <8 x i64> @vwmacc_v8i64(ptr %x, ptr %y, <8 x i64> %z) { ; CHECK-LABEL: vwmacc_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) ; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmacc.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -211,10 +223,11 @@ ; CHECK-LABEL: vwmacc_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmacc.vv v8, v16, v20 +; 
CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -229,10 +242,11 @@ ; CHECK-LABEL: vwmacc_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmacc.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -246,10 +260,11 @@ define <16 x i64> @vwmacc_v16i64(ptr %x, ptr %y, <16 x i64> %z) { ; CHECK-LABEL: vwmacc_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmacc.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -263,9 +278,10 @@ define <2 x i16> @vwmacc_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK-LABEL: vwmacc_vx_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -280,9 +296,10 @@ define <4 x i16> @vwmacc_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK-LABEL: vwmacc_vx_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -297,9 +314,10 @@ define <2 x i32> @vwmacc_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK-LABEL: vwmacc_vx_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -314,9 +332,10 @@ define <8 x i16> @vwmacc_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK-LABEL: vwmacc_vx_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -331,9 +350,10 @@ define <4 x i32> @vwmacc_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK-LABEL: vwmacc_vx_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -348,9 +368,10 @@ define <2 x i64> @vwmacc_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK-LABEL: vwmacc_vx_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli 
zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -365,9 +386,10 @@ define <16 x i16> @vwmacc_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmacc_vx_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -382,9 +404,10 @@ define <8 x i32> @vwmacc_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmacc_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -399,9 +422,10 @@ define <4 x i64> @vwmacc_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmacc_vx_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -417,9 +441,10 @@ ; CHECK-LABEL: vwmacc_vx_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -434,9 +459,10 @@ define <16 x i32> @vwmacc_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) { ; CHECK-LABEL: vwmacc_vx_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -451,9 +477,10 @@ define <8 x i64> @vwmacc_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmacc_vx_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -469,9 +496,10 @@ ; CHECK-LABEL: vwmacc_vx_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -487,9 +515,10 @@ ; CHECK-LABEL: 
vwmacc_vx_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -504,9 +533,10 @@ define <16 x i64> @vwmacc_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmacc_vx_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmacc.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll @@ -5,10 +5,11 @@ define <2 x i16> @vwmaccsu_v2i16(ptr %x, ptr %y, <2 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -22,10 +23,11 @@ define <4 x i16> @vwmaccsu_v4i16(ptr %x, ptr %y, <4 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -39,10 +41,11 @@ define <2 x i32> @vwmaccsu_v2i32(ptr %x, ptr %y, <2 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -56,10 +59,11 @@ define <8 x i16> @vwmaccsu_v8i16(ptr %x, ptr %y, <8 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -73,10 +77,11 @@ define <4 x i32> @vwmaccsu_v4i32(ptr %x, ptr %y, <4 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -90,10 +95,11 @@ define <2 x i64> 
@vwmaccsu_v2i64(ptr %x, ptr %y, <2 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -107,10 +113,11 @@ define <16 x i16> @vwmaccsu_v16i16(ptr %x, ptr %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -124,10 +131,11 @@ define <8 x i32> @vwmaccsu_v8i32(ptr %x, ptr %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -141,10 +149,11 @@ define <4 x i64> @vwmaccsu_v4i64(ptr %x, ptr %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -159,10 +168,11 @@ ; CHECK-LABEL: vwmaccsu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -176,10 +186,11 @@ define <16 x i32> @vwmaccsu_v16i32(ptr %x, ptr %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -193,10 +204,11 @@ define <8 x i64> @vwmaccsu_v8i64(ptr %x, ptr %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) ; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -211,10 +223,11 @@ ; CHECK-LABEL: vwmaccsu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: 
vle8.v v20, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -229,10 +242,11 @@ ; CHECK-LABEL: vwmaccsu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -246,10 +260,11 @@ define <16 x i64> @vwmaccsu_v16i64(ptr %x, ptr %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -263,9 +278,10 @@ define <2 x i16> @vwmaccsu_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK-LABEL: vwmaccsu_vx_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -280,9 +296,10 @@ define <4 x i16> @vwmaccsu_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK-LABEL: vwmaccsu_vx_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -297,9 +314,10 @@ define <2 x i32> @vwmaccsu_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK-LABEL: vwmaccsu_vx_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -314,9 +332,10 @@ define <8 x i16> @vwmaccsu_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK-LABEL: vwmaccsu_vx_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -331,9 +350,10 @@ define <4 x i32> @vwmaccsu_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK-LABEL: vwmaccsu_vx_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -348,9 +368,10 @@ define <2 x i64> 
@vwmaccsu_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK-LABEL: vwmaccsu_vx_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -365,9 +386,10 @@ define <16 x i16> @vwmaccsu_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccsu_vx_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -382,9 +404,10 @@ define <8 x i32> @vwmaccsu_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccsu_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -399,9 +422,10 @@ define <4 x i64> @vwmaccsu_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccsu_vx_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -417,9 +441,10 @@ ; CHECK-LABEL: vwmaccsu_vx_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -434,9 +459,10 @@ define <16 x i32> @vwmaccsu_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccsu_vx_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -451,9 +477,10 @@ define <8 x i64> @vwmaccsu_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccsu_vx_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -469,9 +496,10 @@ ; CHECK-LABEL: vwmaccsu_vx_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v16 +; CHECK-NEXT: 
vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -487,9 +515,10 @@ ; CHECK-LABEL: vwmaccsu_vx_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -504,9 +533,10 @@ define <16 x i64> @vwmaccsu_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccsu_vx_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccsu.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll @@ -5,10 +5,11 @@ define <2 x i16> @vwmaccu_v2i16(ptr %x, ptr %y, <2 x i16> %z) { ; CHECK-LABEL: vwmaccu_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -22,10 +23,11 @@ define <4 x i16> @vwmaccu_v4i16(ptr %x, ptr %y, <4 x i16> %z) { ; CHECK-LABEL: vwmaccu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -39,10 +41,11 @@ define <2 x i32> @vwmaccu_v2i32(ptr %x, ptr %y, <2 x i32> %z) { ; CHECK-LABEL: vwmaccu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -56,10 +59,11 @@ define <8 x i16> @vwmaccu_v8i16(ptr %x, ptr %y, <8 x i16> %z) { ; CHECK-LABEL: vwmaccu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -73,10 +77,11 @@ define <4 x i32> @vwmaccu_v4i32(ptr %x, ptr %y, <4 x i32> %z) { ; CHECK-LABEL: vwmaccu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: 
vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -90,10 +95,11 @@ define <2 x i64> @vwmaccu_v2i64(ptr %x, ptr %y, <2 x i64> %z) { ; CHECK-LABEL: vwmaccu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -107,10 +113,11 @@ define <16 x i16> @vwmaccu_v16i16(ptr %x, ptr %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -124,10 +131,11 @@ define <8 x i32> @vwmaccu_v8i32(ptr %x, ptr %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -141,10 +149,11 @@ define <4 x i64> @vwmaccu_v4i64(ptr %x, ptr %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v10, v11 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v10, v11, v0.t ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -159,10 +168,11 @@ ; CHECK-LABEL: vwmaccu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -176,10 +186,11 @@ define <16 x i32> @vwmaccu_v16i32(ptr %x, ptr %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -193,10 +204,11 @@ define <8 x i64> @vwmaccu_v8i64(ptr %x, ptr %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) ; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v12, v14 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v12, v14, v0.t ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -211,10 +223,11 @@ ; CHECK-LABEL: vwmaccu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; 
CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -229,10 +242,11 @@ ; CHECK-LABEL: vwmaccu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -246,10 +260,11 @@ define <16 x i64> @vwmaccu_v16i64(ptr %x, ptr %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccu_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v16, v20 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vv v8, v16, v20, v0.t ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -263,9 +278,10 @@ define <2 x i16> @vwmaccu_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK-LABEL: vwmaccu_vx_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -280,9 +296,10 @@ define <4 x i16> @vwmaccu_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK-LABEL: vwmaccu_vx_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -297,9 +314,10 @@ define <2 x i32> @vwmaccu_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK-LABEL: vwmaccu_vx_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -314,9 +332,10 @@ define <8 x i16> @vwmaccu_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK-LABEL: vwmaccu_vx_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -331,9 +350,10 @@ define <4 x i32> @vwmaccu_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK-LABEL: vwmaccu_vx_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x 
i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -348,9 +368,10 @@ define <2 x i64> @vwmaccu_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK-LABEL: vwmaccu_vx_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -365,9 +386,10 @@ define <16 x i16> @vwmaccu_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccu_vx_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -382,9 +404,10 @@ define <8 x i32> @vwmaccu_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccu_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -399,9 +422,10 @@ define <4 x i64> @vwmaccu_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccu_vx_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -417,9 +441,10 @@ ; CHECK-LABEL: vwmaccu_vx_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -434,9 +459,10 @@ define <16 x i32> @vwmaccu_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccu_vx_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -451,9 +477,10 @@ define <8 x i64> @vwmaccu_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccu_vx_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -469,9 +496,10 @@ ; CHECK-LABEL: vwmaccu_vx_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu ; CHECK-NEXT: 
vle8.v v16, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -487,9 +515,10 @@ ; CHECK-LABEL: vwmaccu_vx_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -504,9 +533,10 @@ define <16 x i64> @vwmaccu_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccu_vx_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccu.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll @@ -5,9 +5,10 @@ define <2 x i16> @vwmaccus_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK-LABEL: vwmaccus_vx_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -22,9 +23,10 @@ define <4 x i16> @vwmaccus_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK-LABEL: vwmaccus_vx_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -39,9 +41,10 @@ define <2 x i32> @vwmaccus_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK-LABEL: vwmaccus_vx_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -56,9 +59,10 @@ define <8 x i16> @vwmaccus_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK-LABEL: vwmaccus_vx_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -73,9 +77,10 @@ define <4 x i32> @vwmaccus_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK-LABEL: vwmaccus_vx_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx 
v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -90,9 +95,10 @@ define <2 x i64> @vwmaccus_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK-LABEL: vwmaccus_vx_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v9, v0.t ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -107,9 +113,10 @@ define <16 x i16> @vwmaccus_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccus_vx_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -124,9 +131,10 @@ define <8 x i32> @vwmaccus_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccus_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -141,9 +149,10 @@ define <4 x i64> @vwmaccus_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccus_vx_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v10 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v10, v0.t ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -159,9 +168,10 @@ ; CHECK-LABEL: vwmaccus_vx_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -176,9 +186,10 @@ define <16 x i32> @vwmaccus_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccus_vx_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -193,9 +204,10 @@ define <8 x i64> @vwmaccus_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccus_vx_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v12 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v12, v0.t ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -211,9 +223,10 @@ ; CHECK-LABEL: vwmaccus_vx_v64i16: ; CHECK: # 
%bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -229,9 +242,10 @@ ; CHECK-LABEL: vwmaccus_vx_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -246,9 +260,10 @@ define <16 x i64> @vwmaccus_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccus_vx_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v16 +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vwmaccus.vx v8, a1, v16, v0.t ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll --- a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll @@ -6,12 +6,13 @@ ; RV64-1024-LABEL: interleave256: ; RV64-1024: # %bb.0: # %entry ; RV64-1024-NEXT: li a3, 128 -; RV64-1024-NEXT: vsetvli zero, a3, e16, m2, ta, ma +; RV64-1024-NEXT: vsetvli zero, a3, e16, m2, ta, mu ; RV64-1024-NEXT: vle16.v v8, (a1) ; RV64-1024-NEXT: vle16.v v10, (a2) ; RV64-1024-NEXT: vwaddu.vv v12, v8, v10 +; RV64-1024-NEXT: vmset.m v0 ; RV64-1024-NEXT: li a1, -1 -; RV64-1024-NEXT: vwmaccu.vx v12, a1, v10 +; RV64-1024-NEXT: vwmaccu.vx v12, a1, v10, v0.t ; RV64-1024-NEXT: li a1, 256 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RV64-1024-NEXT: vse16.v v12, (a0) @@ -20,12 +21,13 @@ ; RV64-2048-LABEL: interleave256: ; RV64-2048: # %bb.0: # %entry ; RV64-2048-NEXT: li a3, 128 -; RV64-2048-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; RV64-2048-NEXT: vsetvli zero, a3, e16, m1, ta, mu ; RV64-2048-NEXT: vle16.v v8, (a1) ; RV64-2048-NEXT: vle16.v v9, (a2) ; RV64-2048-NEXT: vwaddu.vv v10, v8, v9 +; RV64-2048-NEXT: vmset.m v0 ; RV64-2048-NEXT: li a1, -1 -; RV64-2048-NEXT: vwmaccu.vx v10, a1, v9 +; RV64-2048-NEXT: vwmaccu.vx v10, a1, v9, v0.t ; RV64-2048-NEXT: li a1, 256 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m2, ta, ma ; RV64-2048-NEXT: vse16.v v10, (a0) @@ -44,26 +46,28 @@ ; RV64-1024-LABEL: interleave512: ; RV64-1024: # %bb.0: # %entry ; RV64-1024-NEXT: li a3, 256 -; RV64-1024-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; RV64-1024-NEXT: vle16.v v8, (a1) -; RV64-1024-NEXT: vle16.v v12, (a2) -; RV64-1024-NEXT: vwaddu.vv v16, v8, v12 +; RV64-1024-NEXT: vsetvli zero, a3, e16, m4, ta, mu +; RV64-1024-NEXT: vle16.v v16, (a1) +; RV64-1024-NEXT: vle16.v v20, (a2) +; RV64-1024-NEXT: vwaddu.vv v8, v16, v20 +; RV64-1024-NEXT: vmset.m v0 ; RV64-1024-NEXT: li a1, -1 -; RV64-1024-NEXT: vwmaccu.vx v16, a1, v12 +; RV64-1024-NEXT: vwmaccu.vx v8, a1, v20, v0.t ; RV64-1024-NEXT: li a1, 512 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; RV64-1024-NEXT: vse16.v v16, (a0) +; RV64-1024-NEXT: vse16.v v8, (a0) ; RV64-1024-NEXT: ret ; ; RV64-2048-LABEL: interleave512: ; RV64-2048: # %bb.0: # %entry ; 
RV64-2048-NEXT: li a3, 256 -; RV64-2048-NEXT: vsetvli zero, a3, e16, m2, ta, ma +; RV64-2048-NEXT: vsetvli zero, a3, e16, m2, ta, mu ; RV64-2048-NEXT: vle16.v v8, (a1) ; RV64-2048-NEXT: vle16.v v10, (a2) ; RV64-2048-NEXT: vwaddu.vv v12, v8, v10 +; RV64-2048-NEXT: vmset.m v0 ; RV64-2048-NEXT: li a1, -1 -; RV64-2048-NEXT: vwmaccu.vx v12, a1, v10 +; RV64-2048-NEXT: vwmaccu.vx v12, a1, v10, v0.t ; RV64-2048-NEXT: li a1, 512 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RV64-2048-NEXT: vse16.v v12, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -20,10 +20,11 @@ ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 ; CHECK-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 16 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: vwmaccu.vx v12, a1, v10 +; CHECK-NEXT: vwmaccu.vx v12, a1, v10, v0.t ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: ret @@ -34,10 +35,11 @@ define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: vector_interleave_v16i16_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b) @@ -47,10 +49,11 @@ define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vector_interleave_v8i32_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b) @@ -103,10 +106,11 @@ define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) { ; CHECK-LABEL: vector_interleave_v4f16_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b) @@ -116,10 +120,11 @@ define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-LABEL: vector_interleave_v8f16_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x 
half> %a, <4 x half> %b) @@ -129,10 +134,11 @@ define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: vector_interleave_v4f32_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b) @@ -142,10 +148,11 @@ define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: vector_interleave_v16f16_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b) @@ -155,10 +162,11 @@ define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: vector_interleave_v8f32_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -8,15 +8,16 @@ ; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 ; CHECK-NEXT: vwaddu.vv v16, v8, v12 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v16, a0, v12 +; CHECK-NEXT: vwmaccu.vx v16, a0, v12, v0.t ; CHECK-NEXT: vmsne.vi v8, v18, 0 ; CHECK-NEXT: vmsne.vi v0, v16, 0 ; CHECK-NEXT: csrr a0, vlenb @@ -32,10 +33,11 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { ; CHECK-LABEL: vector_interleave_nxv16i16_nxv8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +; CHECK-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) @@ -45,10 +47,11 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { ; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +;
CHECK-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) @@ -84,10 +87,11 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) { ; CHECK-LABEL: vector_interleave_nxv4f16_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma @@ -104,10 +108,11 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) { ; CHECK-LABEL: vector_interleave_nxv8f16_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) @@ -117,10 +122,11 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) { ; CHECK-LABEL: vector_interleave_nxv4f32_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vwmaccu.vx v10, a0, v9, v0.t ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) @@ -130,10 +136,11 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) { ; CHECK-LABEL: vector_interleave_nxv16f16_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +; CHECK-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) @@ -143,10 +150,11 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) { ; CHECK-LABEL: vector_interleave_nxv8f32_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vwaddu.vv v12, v8, v10 +; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v12, a0, v10 +; CHECK-NEXT: vwmaccu.vx v12, a0, v10, v0.t ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll @@ -12,8 +12,10 @@ define <vscale x 1 x i32> @vwmacc_vv_nxv1i32_unmasked_tu(<vscale x 1 x i16> %a, ; CHECK-LABEL: vwmacc_vv_nxv1i32_unmasked_tu: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vwmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vwmacc.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret <vscale x 1 x i16> %b, <vscale x 1 x i32> %c, i32 zeroext %evl) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-vp.ll rename from
llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll rename to llvm/test/CodeGen/RISCV/rvv/vwmaccsu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-vp.ll @@ -13,8 +13,10 @@ define <vscale x 1 x i32> @vwmacc_vv_nxv1i32_unmasked_tu(<vscale x 1 x i16> %a, ; CHECK-LABEL: vwmacc_vv_nxv1i32_unmasked_tu: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vwmaccsu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vwmaccsu.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret <vscale x 1 x i16> %b, <vscale x 1 x i32> %c, i32 zeroext %evl) { @@ -31,8 +33,10 @@ define <vscale x 1 x i32> @vwmacc_vv_nxv1i32_commute_unmasked_tu(<vscale x 1 x i16> %a, ; CHECK-LABEL: vwmacc_vv_nxv1i32_commute_unmasked_tu: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vwmaccsu.vv v10, v9, v8 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vwmaccsu.vv v10, v9, v8, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret <vscale x 1 x i16> %b, <vscale x 1 x i32> %c, i32 zeroext %evl) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll @@ -12,8 +12,10 @@ define <vscale x 1 x i32> @vwmacc_vv_nxv1i32_unmasked_tu(<vscale x 1 x i16> %a, ; CHECK-LABEL: vwmacc_vv_nxv1i32_unmasked_tu: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vwmaccu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vwmaccu.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret <vscale x 1 x i16> %b, <vscale x 1 x i32> %c, i32 zeroext %evl) {