diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6144,8 +6144,20 @@
   SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, dl, ElemVT, Flags);
 
   // Pad the vector with the neutral element.
-  unsigned OrigElts = OrigVT.getVectorNumElements();
-  unsigned WideElts = WideVT.getVectorNumElements();
+  unsigned OrigElts = OrigVT.getVectorMinNumElements();
+  unsigned WideElts = WideVT.getVectorMinNumElements();
+
+  if (WideVT.isScalableVector()) {
+    unsigned GCD = greatestCommonDivisor(OrigElts, WideElts);
+    EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+                                   ElementCount::getScalable(GCD));
+    SDValue SplatNeutral = DAG.getSplatVector(SplatVT, dl, NeutralElem);
+    for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD)
+      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral,
+                       DAG.getVectorIdxConstant(Idx, dl));
+    return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags);
+  }
+
   for (unsigned Idx = OrigElts; Idx < WideElts; Idx++)
     Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
                      DAG.getVectorIdxConstant(Idx, dl));
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
@@ -29,6 +29,79 @@
   ret half %res
 }
 
+define half @fadda_nxv6f16(<vscale x 6 x half> %v, half %s) {
+; CHECK-LABEL: fadda_nxv6f16:
+; CHECK: str x29, [sp, #-16]!
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: ld1rh { z0.d }, p1/z, [x8]
+; CHECK-NEXT: st1h { z0.d }, p1, [sp, #3, mul vl]
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [sp]
+; CHECK-NEXT: fadda h0, p0, h0, z2.h
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16
+; CHECK-NEXT: ret
+  %res = call half @llvm.vector.reduce.fadd.nxv6f16(half %s, <vscale x 6 x half> %v)
+  ret half %res
+}
+
+define half @fadda_nxv10f16(<vscale x 10 x half> %v, half %s) {
+; CHECK-LABEL: fadda_nxv10f16:
+; CHECK: str x29, [sp, #-16]!
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_0
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ld1rh { z1.d }, p1/z, [x8]
+; CHECK-NEXT: addvl x8, sp, #1
+; CHECK-NEXT: fadda h2, p0, h2, z0.h
+; CHECK-NEXT: st1h { z1.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [sp]
+; CHECK-NEXT: st1h { z3.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.d }, p1, [sp, #6, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z3.h }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1h { z1.d }, p1, [x8, #7, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT: fadda h2, p0, h2, z1.h
+; CHECK-NEXT: fmov s0, s2
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16
+; CHECK-NEXT: ret
+  %res = call half @llvm.vector.reduce.fadd.nxv10f16(half %s, <vscale x 10 x half> %v)
+  ret half %res
+}
+
+define half @fadda_nxv12f16(<vscale x 12 x half> %v, half %s) {
+; CHECK-LABEL: fadda_nxv12f16:
+; CHECK: adrp x8, .LCPI5_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI5_0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: ld1rh { z3.s }, p0/z, [x8]
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fadda h2, p0, h2, z0.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z3.h
+; CHECK-NEXT: fadda h2, p0, h2, z1.h
+; CHECK-NEXT: fmov s0, s2
+; CHECK-NEXT: ret
+  %res = call half @llvm.vector.reduce.fadd.nxv12f16(half %s, <vscale x 12 x half> %v)
+  ret half %res
+}
+
 define float @fadda_nxv2f32(float %init, <vscale x 2 x float> %a) {
 ; CHECK-LABEL: fadda_nxv2f32:
 ; CHECK: ptrue p0.d
@@ -233,6 +306,9 @@
 declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
 declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
 declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
+declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
+declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
+declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
 declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
 declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
 declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=ilp32d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=lp64d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s
 
 declare half @llvm.vector.reduce.fadd.nxv1f16(half, <vscale x 1 x half>)
@@ -1048,3 +1048,101 @@
   %red = call reassoc nsz float @llvm.vector.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v)
   ret float %red
 }
+
+; Test Widen VECREDUCE_SEQ_FADD
+declare half @llvm.vector.reduce.fadd.nxv3f16(half, <vscale x 3 x half>)
+
+define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
+; CHECK-LABEL: vreduce_ord_fadd_nxv3f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: fmv.h.x ft0, zero
+; CHECK-NEXT: fneg.h ft0, ft0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v9, ft0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a1
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.s.f v9, fa0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfredosum.vs v8, v8, v9
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+  %red = call half @llvm.vector.reduce.fadd.nxv3f16(half %s, <vscale x 3 x half> %v)
+  ret half %red
+}
+
+declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
+
+define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
+; CHECK-LABEL: vreduce_ord_fadd_nxv6f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: fmv.h.x ft0, zero
+; CHECK-NEXT: fneg.h ft0, ft0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v10, ft0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v9, v10, a0
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.s.f v10, fa0
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; CHECK-NEXT: vfredosum.vs v8, v8, v10
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+  %red = call half @llvm.vector.reduce.fadd.nxv6f16(half %s, <vscale x 6 x half> %v)
+  ret half %red
+}
+
+declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
+
+define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
+; CHECK-LABEL: vreduce_ord_fadd_nxv10f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: fmv.h.x ft0, zero
+; CHECK-NEXT: fneg.h ft0, ft0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v12, ft0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v10, v12, a0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: vslideup.vi v11, v12, 0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v11, v12, a0
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.s.f v12, fa0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; CHECK-NEXT: vfredosum.vs v8, v8, v12
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+  %red = call half @llvm.vector.reduce.fadd.nxv10f16(half %s, <vscale x 10 x half> %v)
+  ret half %red
+}
+
+declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
+
+define half @vreduce_ord_fadd_nxv12f16(<vscale x 12 x half> %v, half %s) {
+; CHECK-LABEL: vreduce_ord_fadd_nxv12f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.s.f v12, fa0
+; CHECK-NEXT: fmv.h.x ft0, zero
+; CHECK-NEXT: fneg.h ft0, ft0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v11, ft0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; CHECK-NEXT: vfredosum.vs v8, v8, v12
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+  %red = call half @llvm.vector.reduce.fadd.nxv12f16(half %s, <vscale x 12 x half> %v)
+  ret half %red
+}