diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -643,6 +643,46 @@
 defm : VPatBinarySDNodeExt_V_WX;
 }

+multiclass VPatWidenReductionVL<SDNode vop, PatFrags extop, string instruction_name> {
+  foreach vtiToWti = AllWidenableIntVectors in {
+    defvar vti = vtiToWti.Vti;
+    defvar wti = vtiToWti.Wti;
+    defvar wti_m1 = !cast<VTypeInfo>("VI"#wti.SEW#"M1");
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
+                                 VR:$rs2, (vti.Mask true_mask), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>;
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
+                                 VR:$rs2, (vti.Mask V0), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK")
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+  }
+}
+
+multiclass VPatWidenReductionVL_Ext_VL<SDNode vop, PatFrags extop, string instruction_name> {
+  foreach vtiToWti = AllWidenableIntVectors in {
+    defvar vti = vtiToWti.Vti;
+    defvar wti = vtiToWti.Wti;
+    defvar wti_m1 = !cast<VTypeInfo>("VI"#wti.SEW#"M1");
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
+                                 VR:$rs2, (vti.Mask true_mask), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>;
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
+                                 VR:$rs2, (vti.Mask V0), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK")
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Patterns.
 //===----------------------------------------------------------------------===//
@@ -983,6 +1023,13 @@
 defm : VPatReductionVL;
 defm : VPatReductionVL;
 defm : VPatReductionVL;
+
+// 15.2. Vector Widening Integer Reduction Instructions
+defm : VPatWidenReductionVL<rvv_vecreduce_ADD_vl, anyext_oneuse, "PseudoVWREDSUMU">;
+defm : VPatWidenReductionVL<rvv_vecreduce_ADD_vl, zext_oneuse, "PseudoVWREDSUMU">;
+defm : VPatWidenReductionVL_Ext_VL<rvv_vecreduce_ADD_vl, riscv_zext_vl_oneuse, "PseudoVWREDSUMU">;
+defm : VPatWidenReductionVL<rvv_vecreduce_ADD_vl, sext_oneuse, "PseudoVWREDSUM">;
+defm : VPatWidenReductionVL_Ext_VL<rvv_vecreduce_ADD_vl, riscv_sext_vl_oneuse, "PseudoVWREDSUM">;
 } // Predicates = [HasVInstructions]

 // 15.3. Vector Single-Width Floating-Point Reduction Instructions
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -440,6 +440,60 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_v1i64(<1 x i32>* %x) {
+; RV32-LABEL: vwreduce_add_v1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v9, v8
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsrl.vx v8, v9, a0
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_v1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vsext.vf2 v9, v8
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: ret
+  %v = load <1 x i32>, <1 x i32>* %x
+  %e = sext <1 x i32> %v to <1 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_v1i64(<1 x i32>* %x) {
+; RV32-LABEL: vwreduce_uadd_v1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vzext.vf2 v9, v8
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsrl.vx v8, v9, a0
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_v1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vzext.vf2 v9, v8
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: ret
+  %v = load <1 x i32>, <1 x i32>* %x
+  %e = zext <1 x i32> %v to <1 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)

 define i64 @vreduce_add_v2i64(<2 x i64>* %x) {
@@ -469,6 +523,74 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_v2i64(<2 x i32>* %x) {
+; RV32-LABEL: vwreduce_add_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <2 x i32>, <2 x i32>* %x
+  %e = sext <2 x i32> %v to <2 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_v2i64(<2 x i32>* %x) {
+; RV32-LABEL: vwreduce_uadd_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: vwredsumu.vs v8, v8, v9
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vwredsumu.vs v8, v8, v9
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <2 x i32>, <2 x i32>* %x
+  %e = zext <2 x i32> %v to <2 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)

 define i64 @vreduce_add_v4i64(<4 x i64>* %x) {
@@ -498,6 +620,74 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_v4i64(<4 x i32>* %x) {
+; RV32-LABEL: vwreduce_add_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV32-NEXT: vwredsum.vs v8, v8, v9
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vwredsum.vs v8, v8, v9
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <4 x i32>, <4 x i32>* %x
+  %e = sext <4 x i32> %v to <4 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_v4i64(<4 x i32>* %x) {
+; RV32-LABEL: vwreduce_uadd_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV32-NEXT: vwredsumu.vs v8, v8, v9
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vwredsumu.vs v8, v8, v9
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <4 x i32>, <4 x i32>* %x
+  %e = zext <4 x i32> %v to <4 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)

 define i64 @vreduce_add_v8i64(<8 x i64>* %x) {
@@ -527,6 +717,74 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_v8i64(<8 x i32>* %x) {
+; RV32-LABEL: vwreduce_add_v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v10, zero
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vwredsum.vs v8, v8, v10
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v10, zero
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vwredsum.vs v8, v8, v10
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <8 x i32>, <8 x i32>* %x
+  %e = sext <8 x i32> %v to <8 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_v8i64(<8 x i32>* %x) {
+; RV32-LABEL: vwreduce_uadd_v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v10, zero
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vwredsumu.vs v8, v8, v10
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v10, zero
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vwredsumu.vs v8, v8, v10
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <8 x i32>, <8 x i32>* %x
+  %e = zext <8 x i32> %v to <8 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

 define i64 @vreduce_add_v16i64(<16 x i64>* %x) {
@@ -556,6 +814,74 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_v16i64(<16 x i32>* %x) {
+; RV32-LABEL: vwreduce_add_v16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v12, zero
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT: vwredsum.vs v8, v8, v12
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_v16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v12, zero
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV64-NEXT: vwredsum.vs v8, v8, v12
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <16 x i32>, <16 x i32>* %x
+  %e = sext <16 x i32> %v to <16 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_v16i64(<16 x i32>* %x) {
+; RV32-LABEL: vwreduce_uadd_v16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vmv.s.x v12, zero
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT: vwredsumu.vs v8, v8, v12
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_v16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v12, zero
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV64-NEXT: vwredsumu.vs v8, v8, v12
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <16 x i32>, <16 x i32>* %x
+  %e = zext <16 x i32> %v to <16 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)

 define i64 @vreduce_add_v32i64(<32 x i64>* %x) {
@@ -591,6 +917,88 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_v32i64(<32 x i32>* %x) {
+; RV32-LABEL: vwreduce_add_v32i64:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV32-NEXT: vslidedown.vi v16, v8, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vsext.vf2 v24, v16
+; RV32-NEXT: vsext.vf2 v16, v8
+; RV32-NEXT: vadd.vv v8, v16, v24
+; RV32-NEXT: vmv.s.x v16, zero
+; RV32-NEXT: vredsum.vs v8, v8, v16
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_v32i64:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV64-NEXT: vslidedown.vi v16, v8, 16
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vsext.vf2 v24, v16
+; RV64-NEXT: vsext.vf2 v16, v8
+; RV64-NEXT: vadd.vv v8, v16, v24
+; RV64-NEXT: vmv.s.x v16, zero
+; RV64-NEXT: vredsum.vs v8, v8, v16
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <32 x i32>, <32 x i32>* %x
+  %e = sext <32 x i32> %v to <32 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_v32i64(<32 x i32>* %x) {
+; RV32-LABEL: vwreduce_uadd_v32i64:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV32-NEXT: vslidedown.vi v16, v8, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vzext.vf2 v24, v16
+; RV32-NEXT: vzext.vf2 v16, v8
+; RV32-NEXT: vadd.vv v8, v16, v24
+; RV32-NEXT: vmv.s.x v16, zero
+; RV32-NEXT: vredsum.vs v8, v8, v16
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_v32i64:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV64-NEXT: vslidedown.vi v16, v8, 16
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vzext.vf2 v24, v16
+; RV64-NEXT: vzext.vf2 v16, v8
+; RV64-NEXT: vadd.vv v8, v16, v24
+; RV64-NEXT: vmv.s.x v16, zero
+; RV64-NEXT: vredsum.vs v8, v8, v16
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
+  %v = load <32 x i32>, <32 x i32>* %x
+  %e = zext <32 x i32> %v to <32 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)

 define i64 @vreduce_add_v64i64(<64 x i64>* %x) nounwind {
@@ -638,6 +1046,180 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_v64i64(<64 x i32>* %x) {
+; RV32-LABEL: vwreduce_add_v64i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: addi a1, a0, 128
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
+; RV32-NEXT: vle32.v v0, (a0)
+; RV32-NEXT: vle32.v v24, (a1)
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vsext.vf2 v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsext.vf2 v16, v24
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV32-NEXT: vslidedown.vi v0, v0, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vsext.vf2 v8, v0
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV32-NEXT: vslidedown.vi v24, v24, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vsext.vf2 v0, v24
+; RV32-NEXT: vadd.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vmv.s.x v16, zero
+; RV32-NEXT: vredsum.vs v8, v8, v16
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a2
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add sp, sp, a2
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_add_v64i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: addi a1, a0, 128
+; RV64-NEXT: li a2, 32
+; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu
+; RV64-NEXT: vle32.v v0, (a0)
+; RV64-NEXT: vle32.v v24, (a1)
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vsext.vf2 v8, v0
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsext.vf2 v16, v24
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV64-NEXT: vslidedown.vi v0, v0, 16
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vsext.vf2 v8, v0
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV64-NEXT: vslidedown.vi v24, v24, 16
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vsext.vf2 v0, v24
+; RV64-NEXT: vadd.vv v8, v8, v0
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vmv.s.x v16, zero
+; RV64-NEXT: vredsum.vs v8, v8, v16
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add sp, sp, a1
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+  %v = load <64 x i32>, <64 x i32>* %x
+  %e = sext <64 x i32> %v to <64 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_v64i64(<64 x i32>* %x) {
+; RV32-LABEL: vwreduce_uadd_v64i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: addi a1, a0, 128
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
+; RV32-NEXT: vle32.v v0, (a0)
+; RV32-NEXT: vle32.v v24, (a1)
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vzext.vf2 v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vzext.vf2 v16, v24
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV32-NEXT: vslidedown.vi v0, v0, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vzext.vf2 v8, v0
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV32-NEXT: vslidedown.vi v24, v24, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT: vzext.vf2 v0, v24
+; RV32-NEXT: vadd.vv v8, v8, v0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vadd.vv v8, v16, v8
+; RV32-NEXT: vmv.s.x v16, zero
+; RV32-NEXT: vredsum.vs v8, v8, v16
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v8, a2
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add sp, sp, a2
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwreduce_uadd_v64i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: addi a1, a0, 128
+; RV64-NEXT: li a2, 32
+; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu
+; RV64-NEXT: vle32.v v0, (a0)
+; RV64-NEXT: vle32.v v24, (a1)
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vzext.vf2 v8, v0
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vzext.vf2 v16, v24
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV64-NEXT: vslidedown.vi v0, v0, 16
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vzext.vf2 v8, v0
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu
+; RV64-NEXT: vslidedown.vi v24, v24, 16
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT: vzext.vf2 v0, v24
+; RV64-NEXT: vadd.vv v8, v8, v0
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vadd.vv v8, v16, v8
+; RV64-NEXT: vmv.s.x v16, zero
+; RV64-NEXT: vredsum.vs v8, v8, v16
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add sp, sp, a1
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+  %v = load <64 x i32>, <64 x i32>* %x
+  %e = zext <64 x i32> %v to <64 x i64>
+  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
+  ret i64 %red
+}
+
 declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)

 define i8 @vreduce_and_v1i8(<1 x i8>* %x) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll
@@ -1120,6 +1120,36 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_nxv1i32(<vscale x 1 x i32> %v) {
+; CHECK-LABEL: vwreduce_add_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vwredsum.vs v8, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+  %e = sext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_nxv1i32(<vscale x 1 x i32> %v) {
+; CHECK-LABEL: vwreduce_uadd_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vwredsumu.vs v8, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+  %e = zext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64>)

 define i64 @vreduce_umax_nxv1i64(<vscale x 1 x i64> %v) {
@@ -1244,6 +1274,36 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
+; CHECK-LABEL: vwreduce_add_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vwredsum.vs v8, v8, v9
+; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+  %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_nxv2i32(<vscale x 2 x i32> %v) {
+; CHECK-LABEL: vwreduce_uadd_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vwredsumu.vs v8, v8, v9
+; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+  %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64>)

 define i64 @vreduce_umax_nxv2i64(<vscale x 2 x i64> %v) {
@@ -1368,6 +1428,36 @@
 ret i64 %red
 }

+define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: vwreduce_add_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; CHECK-NEXT: vwredsum.vs v8, v8, v10
+; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+  %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: vwreduce_uadd_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
+; CHECK-NEXT: vwredsumu.vs v8, v8, v10
+; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+  %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64>)

 define i64 @vreduce_umax_nxv4i64(<vscale x 4 x i64> %v) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -1353,6 +1353,76 @@
 ret i64 %r
 }

+define signext i64 @vpwreduce_add_nxv1i32(i64 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_add_nxv1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e32, mf2, tu, mu
+; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v9, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpwreduce_add_nxv1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: ret
+  %e = sext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, <vscale x 1 x i64> %e, <vscale x 1 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
+define signext i64 @vpwreduce_uadd_nxv1i32(i64 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_uadd_nxv1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e32, mf2, tu, mu
+; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v9, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpwreduce_uadd_nxv1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: ret
+  %e = sext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, <vscale x 1 x i64> %e, <vscale x 1 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
 declare i64 @llvm.vp.reduce.umax.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)

 define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -1625,6 +1695,76 @@
 ret i64 %r
 }

+define signext i64 @vwpreduce_add_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vwpreduce_add_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m1, tu, mu
+; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v9, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwpreduce_add_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: ret
+  %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, <vscale x 2 x i64> %e, <vscale x 2 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
+define signext i64 @vwpreduce_uadd_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vwpreduce_uadd_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m1, tu, mu
+; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v9, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwpreduce_uadd_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v9
+; RV64-NEXT: ret
+  %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, <vscale x 2 x i64> %e, <vscale x 2 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
 declare i64 @llvm.vp.reduce.umax.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)

 define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
@@ -1897,6 +2037,76 @@
 ret i64 %r
 }

+define signext i64 @vpwreduce_add_nxv4i32(i64 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_add_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, mu
+; RV32-NEXT: vwredsum.vs v10, v8, v10, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v10, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpwreduce_add_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v10, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; RV64-NEXT: vwredsum.vs v10, v8, v10, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: ret
+  %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, <vscale x 4 x i64> %e, <vscale x 4 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
+define signext i64 @vpwreduce_uadd_nxv4i32(i64 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_uadd_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, mu
+; RV32-NEXT: vwredsumu.vs v10, v8, v10, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v8, v10, a1
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpwreduce_uadd_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT: vmv.s.x v10, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; RV64-NEXT: vwredsumu.vs v10, v8, v10, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: ret
+  %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, <vscale x 4 x i64> %e, <vscale x 4 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
 declare i64 @llvm.vp.reduce.umax.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)

 define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {