diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -643,6 +643,26 @@
   defm : VPatBinarySDNodeExt_V_WX;
 }
 
+multiclass VPatWidenReductionVL<SDNode vop, PatFrags extop, string instruction_name> {
+  foreach vtiToWti = AllWidenableIntVectors in {
+    defvar vti = vtiToWti.Vti;
+    defvar wti = vtiToWti.Wti;
+    defvar wti_m1 = !cast<VTypeInfo>("VI"#wti.SEW#"M1");
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
+                                 VR:$rs2, (vti.Mask true_mask), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>;
+    def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+                                 (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
+                                 VR:$rs2, (vti.Mask V0), VLOpFrag)),
+             (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK")
+                (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+                (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Patterns.
 //===----------------------------------------------------------------------===//
@@ -983,6 +1003,11 @@
 defm : VPatReductionVL;
 defm : VPatReductionVL;
 defm : VPatReductionVL;
+
+// 15.2. Vector Widening Integer Reduction Instructions
+defm : VPatWidenReductionVL<rvv_vecreduce_ADD_vl, anyext_oneuse, "PseudoVWREDSUMU">;
+defm : VPatWidenReductionVL<rvv_vecreduce_ADD_vl, sext_oneuse, "PseudoVWREDSUM">;
+defm : VPatWidenReductionVL<rvv_vecreduce_ADD_vl, zext_oneuse, "PseudoVWREDSUMU">;
 } // Predicates = [HasVInstructions]
 
 // 15.3. Vector Single-Width Floating-Point Reduction Instructions
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll
@@ -1120,6 +1120,36 @@
   ret i64 %red
 }
 
+define i64 @vwreduce_add_nxv1i32(<vscale x 1 x i32> %v) {
+; CHECK-LABEL: vwreduce_add_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
+; CHECK-NEXT:    vwredsum.vs v8, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %e = sext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_nxv1i32(<vscale x 1 x i32> %v) {
+; CHECK-LABEL: vwreduce_uadd_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
+; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %e = zext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64>)
 
 define i64 @vreduce_umax_nxv1i64(<vscale x 1 x i64> %v) {
@@ -1244,6 +1274,36 @@
   ret i64 %red
 }
 
+define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
+; CHECK-LABEL: vwreduce_add_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT:    vwredsum.vs v8, v8, v9
+; CHECK-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_nxv2i32(<vscale x 2 x i32> %v) {
+; CHECK-LABEL: vwreduce_uadd_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
+; CHECK-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64>)
 
 define i64 @vreduce_umax_nxv2i64(<vscale x 2 x i64> %v) {
@@ -1368,6 +1428,36 @@
   ret i64 %red
 }
 
+define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: vwreduce_add_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vwredsum.vs v8, v8, v10
+; CHECK-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
+  ret i64 %red
+}
+
+define i64 @vwreduce_uadd_nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: vwreduce_uadd_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
+; CHECK-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
+  ret i64 %red
+}
+
 declare i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64>)
 
 define i64 @vreduce_umax_nxv4i64(<vscale x 4 x i64> %v) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -1353,6 +1353,76 @@
   ret i64 %r
 }
 
+define signext i64 @vpwreduce_add_nxv1i32(i64 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_add_nxv1i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e32, mf2, tu, mu
+; RV32-NEXT:    vwredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    vsrl.vx v8, v9, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpwreduce_add_nxv1i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT:    vwredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    ret
+  %e = sext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, <vscale x 1 x i64> %e, <vscale x 1 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
+define signext i64 @vpwreduce_uadd_nxv1i32(i64 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_uadd_nxv1i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e32, mf2, tu, mu
+; RV32-NEXT:    vwredsumu.vs v9, v8, v9, v0.t
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    vsrl.vx v8, v9, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpwreduce_uadd_nxv1i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT:    vwredsumu.vs v9, v8, v9, v0.t
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    ret
+  %e = zext <vscale x 1 x i32> %v to <vscale x 1 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, <vscale x 1 x i64> %e, <vscale x 1 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
 declare i64 @llvm.vp.reduce.umax.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
 
 define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -1625,6 +1695,76 @@
   ret i64 %r
 }
 
+define signext i64 @vwpreduce_add_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vwpreduce_add_nxv2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e32, m1, tu, mu
+; RV32-NEXT:    vwredsum.vs v9, v8, v9, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    vsrl.vx v8, v9, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwpreduce_add_nxv2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT:    vwredsum.vs v9, v8, v9, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    ret
+  %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, <vscale x 2 x i64> %e, <vscale x 2 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
+define signext i64 @vwpreduce_uadd_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vwpreduce_uadd_nxv2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e32, m1, tu, mu
+; RV32-NEXT:    vwredsumu.vs v9, v8, v9, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    vsrl.vx v8, v9, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwpreduce_uadd_nxv2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT:    vwredsumu.vs v9, v8, v9, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    ret
+  %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, <vscale x 2 x i64> %e, <vscale x 2 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
 declare i64 @llvm.vp.reduce.umax.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
 
 define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
@@ -1897,6 +2037,76 @@
   ret i64 %r
 }
 
+define signext i64 @vpwreduce_add_nxv4i32(i64 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_add_nxv4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e32, m2, tu, mu
+; RV32-NEXT:    vwredsum.vs v10, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    vsrl.vx v8, v10, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpwreduce_add_nxv4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vsetvli zero, a1, e32, m2, tu, mu
+; RV64-NEXT:    vwredsum.vs v10, v8, v10, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    ret
+  %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, <vscale x 4 x i64> %e, <vscale x 4 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
+define signext i64 @vpwreduce_uadd_nxv4i32(i64 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpwreduce_uadd_nxv4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vsetvli zero, a2, e32, m2, tu, mu
+; RV32-NEXT:    vwredsumu.vs v10, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT:    vsrl.vx v8, v10, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpwreduce_uadd_nxv4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vsetvli zero, a1, e32, m2, tu, mu
+; RV64-NEXT:    vwredsumu.vs v10, v8, v10, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m1, ta, mu
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    ret
+  %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+  %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, <vscale x 4 x i64> %e, <vscale x 4 x i1> %m, i32 %evl)
+  ret i64 %r
+}
+
 declare i64 @llvm.vp.reduce.umax.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
 
 define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {