diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -683,6 +683,46 @@ defm : VPatBinarySDNodeExt_V_WX;
 }
 
+multiclass VPatWidenReductionVL<SDNode vop, PatFrags extop, string instruction_name> {
+  foreach vtiToWti = AllWidenableIntVectors in {
+    defvar vti = vtiToWti.Vti;
+    defvar wti = vtiToWti.Wti;
+    defvar wti_m1 = !cast<VTypeInfo>("VI"#wti.SEW#"M1");
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
+                                 VR:$rs2, (vti.Mask true_mask), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>;
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
+                                 VR:$rs2, (vti.Mask V0), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK")
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+  }
+}
+
+multiclass VPatWidenReductionVL_Ext_VL<SDNode vop, PatFrags extop, string instruction_name> {
+  foreach vtiToWti = AllWidenableIntVectors in {
+    defvar vti = vtiToWti.Vti;
+    defvar wti = vtiToWti.Wti;
+    defvar wti_m1 = !cast<VTypeInfo>("VI"#wti.SEW#"M1");
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
+                                 VR:$rs2, (vti.Mask true_mask), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>;
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
+                                 VR:$rs2, (vti.Mask V0), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK")
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Patterns.
 //===----------------------------------------------------------------------===//
@@ -1073,6 +1113,13 @@ defm : VPatReductionVL;
 defm : VPatReductionVL;
 defm : VPatReductionVL;
+
+// 15.2. Vector Widening Integer Reduction Instructions
+defm : VPatWidenReductionVL;
+defm : VPatWidenReductionVL;
+defm : VPatWidenReductionVL_Ext_VL;
+defm : VPatWidenReductionVL;
+defm : VPatWidenReductionVL_Ext_VL;
 } // Predicates = [HasVInstructions]
 
 // 15.3.
Vector Single-Width Floating-Point Reduction Instructions diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -173,6 +173,36 @@ ret i16 %red } +define i16 @vwreduce_add_v1i16(<1 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i8>, <1 x i8>* %x + %e = sext <1 x i8> %v to <1 x i16> + %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v1i16(<1 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i8>, <1 x i8>* %x + %e = zext <1 x i8> %v to <1 x i16> + %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) define i16 @vreduce_add_v2i16(<2 x i16>* %x) { @@ -189,6 +219,42 @@ ret i16 %red } +define i16 @vwreduce_add_v2i16(<2 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <2 x i8>, <2 x i8>* %x + %e = sext <2 x i8> %v to <2 x i16> + %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v2i16(<2 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <2 x i8>, <2 x i8>* %x + %e = zext <2 x i8> %v to <2 x i16> + %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) define i16 @vreduce_add_v4i16(<4 x i16>* %x) { @@ -205,6 +271,42 @@ ret i16 %red } +define i16 @vwreduce_add_v4i16(<4 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <4 x i8>, <4 x i8>* %x + %e = sext <4 x i8> %v to <4 x i16> + %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v4i16(<4 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: 
vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <4 x i8>, <4 x i8>* %x + %e = zext <4 x i8> %v to <4 x i16> + %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) define i16 @vreduce_add_v8i16(<8 x i16>* %x) { @@ -221,6 +323,42 @@ ret i16 %red } +define i16 @vwreduce_add_v8i16(<8 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i8>, <8 x i8>* %x + %e = sext <8 x i8> %v to <8 x i16> + %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v8i16(<8 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i8>, <8 x i8>* %x + %e = zext <8 x i8> %v to <8 x i16> + %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) define i16 @vreduce_add_v16i16(<16 x i16>* %x) { @@ -237,6 +375,42 @@ ret i16 %red } +define i16 @vwreduce_add_v16i16(<16 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <16 x i8>, <16 x i8>* %x + %e = sext <16 x i8> %v to <16 x i16> + %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v16i16(<16 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <16 x i8>, <16 x i8>* %x + %e = zext <16 x i8> %v to <16 x i16> + %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) define i16 @vreduce_add_v32i16(<32 x i16>* %x) { @@ -256,6 +430,44 @@ ret i16 %red } +define i16 @vwreduce_add_v32i16(<32 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i8>, <32 x i8>* %x + %e = sext <32 x i8> %v to <32 x i16> + %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v32i16(<32 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i8>, <32 x i8>* %x + %e = zext <32 x i8> %v to <32 x i16> + %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) define i16 @vreduce_add_v64i16(<64 x i16>* %x) { @@ -275,6 +487,44 @@ ret i16 %red } +define i16 @vwreduce_add_v64i16(<64 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i8>, <64 x i8>* %x + %e = sext <64 x i8> %v to <64 x i16> + %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v64i16(<64 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i8>, <64 x i8>* %x + %e = zext <64 x i8> %v to <64 x i16> + %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>) define i16 @vreduce_add_v128i16(<128 x i16>* %x) { @@ -297,6 +547,52 @@ ret i16 %red } +define i16 @vwreduce_add_v128i16(<128 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwadd.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <128 x i8>, <128 x i8>* %x + %e = sext <128 x i8> %v to <128 x i16> + %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v128i16(<128 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, 
mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwaddu.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <128 x i8>, <128 x i8>* %x + %e = zext <128 x i8> %v to <128 x i16> + %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e) + ret i16 %red +} + declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>) define i32 @vreduce_add_v1i32(<1 x i32>* %x) { @@ -311,6 +607,36 @@ ret i32 %red } +define i32 @vwreduce_add_v1i32(<1 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i16>, <1 x i16>* %x + %e = sext <1 x i16> %v to <1 x i32> + %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v1i32(<1 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i16>, <1 x i16>* %x + %e = zext <1 x i16> %v to <1 x i32> + %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) define i32 @vreduce_add_v2i32(<2 x i32>* %x) { @@ -327,6 +653,42 @@ ret i32 %red } +define i32 @vwreduce_add_v2i32(<2 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <2 x i16>, <2 x i16>* %x + %e = sext <2 x i16> %v to <2 x i32> + %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v2i32(<2 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <2 x i16>, <2 x i16>* %x + %e = zext <2 x i16> %v to <2 x i32> + %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) define i32 @vreduce_add_v4i32(<4 x i32>* %x) { @@ -343,6 +705,42 @@ ret i32 %red } +define i32 @vwreduce_add_v4i32(<4 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, 
v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <4 x i16>, <4 x i16>* %x + %e = sext <4 x i16> %v to <4 x i32> + %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v4i32(<4 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <4 x i16>, <4 x i16>* %x + %e = zext <4 x i16> %v to <4 x i32> + %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @vreduce_add_v8i32(<8 x i32>* %x) { @@ -359,8 +757,44 @@ ret i32 %red } -declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) - +define i32 @vwreduce_add_v8i32(<8 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i16>, <8 x i16>* %x + %e = sext <8 x i16> %v to <8 x i32> + %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v8i32(<8 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i16>, <8 x i16>* %x + %e = zext <8 x i16> %v to <8 x i32> + %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e) + ret i32 %red +} + +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) + define i32 @vreduce_add_v16i32(<16 x i32>* %x) { ; CHECK-LABEL: vreduce_add_v16i32: ; CHECK: # %bb.0: @@ -375,6 +809,42 @@ ret i32 %red } +define i32 @vwreduce_add_v16i32(<16 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <16 x i16>, <16 x i16>* %x + %e = sext <16 x i16> %v to <16 x i32> + %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v16i32(<16 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; 
CHECK-NEXT: ret + %v = load <16 x i16>, <16 x i16>* %x + %e = zext <16 x i16> %v to <16 x i32> + %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) define i32 @vreduce_add_v32i32(<32 x i32>* %x) { @@ -394,6 +864,44 @@ ret i32 %red } +define i32 @vwreduce_add_v32i32(<32 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i16>, <32 x i16>* %x + %e = sext <32 x i16> %v to <32 x i32> + %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v32i32(<32 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i16>, <32 x i16>* %x + %e = zext <32 x i16> %v to <32 x i32> + %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) define i32 @vreduce_add_v64i32(<64 x i32>* %x) { @@ -416,6 +924,52 @@ ret i32 %red } +define i32 @vwreduce_add_v64i32(<64 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwadd.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i16>, <64 x i16>* %x + %e = sext <64 x i16> %v to <64 x i32> + %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v64i32(<64 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwaddu.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i16>, <64 x i16>* %x + %e = zext <64 x i16> %v to <64 x i32> + %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e) + ret i32 %red +} + declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) define i64 @vreduce_add_v1i64(<1 x i64>* %x) { @@ -440,6 +994,60 @@ ret i64 %red } +define i64 @vwreduce_add_v1i64(<1 x i32>* %x) { +; RV32-LABEL: 
vwreduce_add_v1i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vsext.vf2 v9, v8 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsrl.vx v8, v9, a0 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %v = load <1 x i32>, <1 x i32>* %x + %e = sext <1 x i32> %v to <1 x i64> + %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v1i64(<1 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v1i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsrl.vx v8, v9, a0 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %v = load <1 x i32>, <1 x i32>* %x + %e = zext <1 x i32> %v to <1 x i64> + %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) define i64 @vreduce_add_v2i64(<2 x i64>* %x) { @@ -469,6 +1077,74 @@ ret i64 %red } +define i64 @vwreduce_add_v2i64(<2 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <2 x i32>, <2 x i32>* %x + %e = sext <2 x i32> %v to <2 x i64> + %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v2i64(<2 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v 
v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <2 x i32>, <2 x i32>* %x + %e = zext <2 x i32> %v to <2 x i64> + %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) define i64 @vreduce_add_v4i64(<4 x i64>* %x) { @@ -498,6 +1174,74 @@ ret i64 %red } +define i64 @vwreduce_add_v4i64(<4 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <4 x i32>, <4 x i32>* %x + %e = sext <4 x i32> %v to <4 x i64> + %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v4i64(<4 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <4 x i32>, <4 x i32>* %x + %e = zext <4 x i32> %v to <4 x i64> + %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) define i64 @vreduce_add_v8i64(<8 x i64>* %x) { @@ -527,6 +1271,74 @@ ret i64 %red } +define i64 @vwreduce_add_v8i64(<8 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v10 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: 
vwreduce_add_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v10, zero +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v10 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <8 x i32>, <8 x i32>* %x + %e = sext <8 x i32> %v to <8 x i64> + %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v8i64(<8 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v10 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v10, zero +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v10 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <8 x i32>, <8 x i32>* %x + %e = zext <8 x i32> %v to <8 x i64> + %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) define i64 @vreduce_add_v16i64(<16 x i64>* %x) { @@ -556,6 +1368,74 @@ ret i64 %red } +define i64 @vwreduce_add_v16i64(<16 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v12 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v12, zero +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v12 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <16 x i32>, <16 x i32>* %x + %e = sext <16 x i32> %v to <16 x i64> + %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v16i64(<16 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v12 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: 
vwreduce_uadd_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v12, zero +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v12 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <16 x i32>, <16 x i32>* %x + %e = zext <16 x i32> %v to <16 x i64> + %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>) define i64 @vreduce_add_v32i64(<32 x i64>* %x) { @@ -591,6 +1471,88 @@ ret i64 %red } +define i64 @vwreduce_add_v32i64(<32 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwadd.vv v24, v8, v16 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v8, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vredsum.vs v8, v24, v8 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwadd.vv v24, v8, v16 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vredsum.vs v8, v24, v8 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <32 x i32>, <32 x i32>* %x + %e = sext <32 x i32> %v to <32 x i64> + %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v32i64(<32 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwaddu.vv v24, v8, v16 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v8, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vredsum.vs v8, v24, v8 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwaddu.vv v24, v8, v16 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vredsum.vs v8, v24, v8 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <32 x i32>, <32 x i32>* %x + %e = zext <32 x i32> %v to <32 x i64> + %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e) + ret i64 %red +} + declare i64 
@llvm.vector.reduce.add.v64i64(<64 x i64>) define i64 @vreduce_add_v64i64(<64 x i64>* %x) nounwind { @@ -638,6 +1600,284 @@ ret i64 %red } +define i64 @vwreduce_add_v64i64(<64 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v64i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwadd.vv v0, v24, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwadd.vv v0, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vredsum.vs v8, v8, v16 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add sp, sp, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v64i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; 
RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwadd.vv v0, v24, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwadd.vv v0, v8, v16 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add sp, sp, a1 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = load <64 x i32>, <64 x i32>* %x + %e = sext <64 x i32> %v to <64 x i64> + %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v64i64(<64 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v64i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwaddu.vv v0, v24, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwaddu.vv v0, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vredsum.vs v8, v8, v16 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, 
e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add sp, sp, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v64i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwaddu.vv v0, v24, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwaddu.vv v0, v8, v16 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add sp, sp, a1 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = load <64 x i32>, <64 x i32>* %x + %e = zext <64 x i32> %v to <64 x i64> + %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e) + ret i64 %red +} + declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>) define i8 @vreduce_and_v1i8(<1 x i8>* %x) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll @@ -382,6 +382,36 @@ ret i16 %red } +define signext i16 @vwreduce_add_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i8: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv1i16() define signext i16 @vreduce_umax_nxv1i16( %v) { @@ -505,6 +535,36 @@ ret i16 %red } +define signext i16 @vwreduce_add_nxv2i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv2i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv2i16() define signext i16 @vreduce_umax_nxv2i16( %v) { @@ -628,6 +688,36 @@ ret i16 %red } +define signext i16 @vwreduce_add_nxv4i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv4i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv4i16() define signext i16 @vreduce_umax_nxv4i16( %v) { @@ -751,6 +841,36 @@ ret i32 %red } +define i32 @vwreduce_add_nxv1i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv1i32( %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_nxv1i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv1i32( %e) + ret i32 %red +} + declare i32 
@llvm.vector.reduce.umax.nxv1i32() define i32 @vreduce_umax_nxv1i32( %v) { @@ -874,6 +994,36 @@ ret i32 %red } +define i32 @vwreduce_add_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv2i32() define i32 @vreduce_umax_nxv2i32( %v) { @@ -997,6 +1147,36 @@ ret i32 %red } +define i32 @vwreduce_add_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv4i32() define i32 @vreduce_umax_nxv4i32( %v) { @@ -1124,6 +1304,44 @@ ret i64 %red } +define i64 @vwreduce_add_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv1i64() define i64 @vreduce_umax_nxv1i64( %v) { @@ -1292,6 +1510,44 @@ ret i64 %red } +define i64 
@vwreduce_add_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv2i64() define i64 @vreduce_umax_nxv2i64( %v) { @@ -1460,6 +1716,44 @@ ret i64 %red } +define i64 @vwreduce_add_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv4i64() define i64 @vreduce_umax_nxv4i64( %v) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll @@ -382,6 +382,36 @@ ret i16 %red } +define signext i16 @vwreduce_add_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, 
ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext <vscale x 1 x i8> %v to <vscale x 1 x i16> + %red = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv1i16(<vscale x 1 x i16>) define signext i16 @vreduce_umax_nxv1i16(<vscale x 1 x i16> %v) { @@ -505,6 +535,36 @@ ret i16 %red } +define signext i16 @vwreduce_add_nxv2i8(<vscale x 2 x i8> %v) { +; CHECK-LABEL: vwreduce_add_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext <vscale x 2 x i8> %v to <vscale x 2 x i16> + %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv2i8(<vscale x 2 x i8> %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext <vscale x 2 x i8> %v to <vscale x 2 x i16> + %red = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv2i16(<vscale x 2 x i16>) define signext i16 @vreduce_umax_nxv2i16(<vscale x 2 x i16> %v) { @@ -628,6 +688,36 @@ ret i16 %red } +define signext i16 @vwreduce_add_nxv4i8(<vscale x 4 x i8> %v) { +; CHECK-LABEL: vwreduce_add_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext <vscale x 4 x i8> %v to <vscale x 4 x i16> + %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv4i8(<vscale x 4 x i8> %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext <vscale x 4 x i8> %v to <vscale x 4 x i16> + %red = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv4i16(<vscale x 4 x i16>) define signext i16 @vreduce_umax_nxv4i16(<vscale x 4 x i16> %v) { @@ -751,6 +841,36 @@ ret i32 %red } +define signext i32 @vwreduce_add_nxv1i16(<vscale x 1 x i16> %v) { +; CHECK-LABEL: vwreduce_add_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext <vscale x 1 x i16> %v to <vscale x 1 x i32> + %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %e) + ret i32 %red +} + +define signext i32 @vwreduce_uadd_nxv1i16(<vscale x 1 x i16> %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext <vscale x 1 x i16> %v to <vscale x 1 x i32> + %red = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32>) define signext i32 @vreduce_umax_nxv1i32(<vscale x 1 x i32> %v) { @@ -874,6 +994,36 @@ ret i32 %red }
+define signext i32 @vwreduce_add_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + +define signext i32 @vwreduce_uadd_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv2i32() define signext i32 @vreduce_umax_nxv2i32( %v) { @@ -997,6 +1147,36 @@ ret i32 %red } +define signext i32 @vwreduce_add_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + +define signext i32 @vwreduce_uadd_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv4i32() define signext i32 @vreduce_umax_nxv4i32( %v) { @@ -1120,6 +1300,36 @@ ret i64 %red } +define i64 @vwreduce_add_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv1i64() define i64 @vreduce_umax_nxv1i64( %v) { @@ -1244,6 +1454,36 @@ ret i64 %red } +define i64 @vwreduce_add_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e 
= sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv2i64() define i64 @vreduce_umax_nxv2i64( %v) { @@ -1368,6 +1608,36 @@ ret i64 %red } +define i64 @vwreduce_add_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv4i64() define i64 @vreduce_umax_nxv4i64( %v) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll @@ -1353,6 +1353,76 @@ ret i64 %r } +define signext i64 @vpwreduce_add_nxv1i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_add_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, mf2, tu, mu +; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_add_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = sext %v to + %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + +define signext i64 @vpwreduce_uadd_nxv1i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_uadd_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, mf2, tu, mu +; RV32-NEXT: 
vwredsumu.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_uadd_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vwredsumu.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = zext <vscale x 1 x i32> %v to <vscale x 1 x i64> + %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, <vscale x 1 x i64> %e, <vscale x 1 x i1> %m, i32 %evl) + ret i64 %r +} + declare i64 @llvm.vp.reduce.umax.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32) define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { @@ -1625,6 +1695,76 @@ ret i64 %r } +define signext i64 @vwpreduce_add_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vwpreduce_add_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m1, tu, mu +; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwpreduce_add_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64> + %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, <vscale x 2 x i64> %e, <vscale x 2 x i1> %m, i32 %evl) + ret i64 %r +} + +define signext i64 @vwpreduce_uadd_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vwpreduce_uadd_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m1, tu, mu +; RV32-NEXT: vwredsumu.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwpreduce_uadd_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vwredsumu.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64> + %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, <vscale x 2 x i64> %e, <vscale x 2 x i1> %m, i32 %evl) + ret i64 %r +} + declare i64 @llvm.vp.reduce.umax.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32) define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { @@ -1897,6 +2037,76 @@ ret i64 %r } +define signext i64 @vpwreduce_add_nxv4i32(i64 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_add_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT:
addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, mu +; RV32-NEXT: vwredsum.vs v10, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v10, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_add_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV64-NEXT: vwredsum.vs v10, v8, v10, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: ret + %e = sext %v to + %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + +define signext i64 @vpwreduce_uadd_nxv4i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_uadd_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, mu +; RV32-NEXT: vwredsumu.vs v10, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v10, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_uadd_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV64-NEXT: vwredsumu.vs v10, v8, v10, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: ret + %e = zext %v to + %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + declare i64 @llvm.vp.reduce.umax.nxv4i64(i64, , , i32) define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) {
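Illustrative note (not part of the patch): the new VPatWidenReductionVL patterns let a sign/zero extension that feeds llvm.vector.reduce.add select the RVV widening reductions directly, which is why the sequences above use a single vwredsum.vs / vwredsumu.vs instead of a separate vsext/vzext followed by vredsum.vs. Below is a minimal standalone .ll sketch of the IR shape being matched, assuming the same rv64 +v configuration as the tests above; the function name and element count are only illustrative.

; Sign-extending widening sum: with these patterns it should lower to vwredsum.vs;
; replacing sext with zext should instead select vwredsumu.vs.
define i64 @example_widening_sum(<vscale x 2 x i32> %v) {
  %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
  %r = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
  ret i64 %r
}

declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)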