diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -242,6 +242,11 @@
   TargetTransformInfo::VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const {
     using VPLegalization = TargetTransformInfo::VPLegalization;
+    if (PI.getIntrinsicID() == Intrinsic::vp_reduce_mul &&
+        cast<VectorType>(PI.getArgOperand(1)->getType())
+            ->getElementType()
+            ->getIntegerBitWidth() != 1)
+      return VPLegalization(VPLegalization::Discard, VPLegalization::Convert);
     return VPLegalization(VPLegalization::Legal, VPLegalization::Legal);
   }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare i1 @llvm.vp.reduce.and.v1i1(i1, <1 x i1>, <1 x i1>, i32)
 
@@ -1006,3 +1006,465 @@
   %r = call i1 @llvm.vp.reduce.mul.v64i1(i1 %s, <64 x i1> %v, <64 x i1> %m, i32 %evl)
   ret i1 %r
 }
+
+declare i8 @llvm.vp.reduce.mul.v1i8(i8, <1 x i8>, <1 x i1>, i32)
+
+define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_mul_v1i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    vmsne.vi v9, v9, 0
+; RV32-NEXT:    vmand.mm v0, v9, v0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v9, 1
+; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    call __mulsi3@plt
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpreduce_mul_v1i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vmsne.vi v9, v9, 0
+; RV64-NEXT:    vmand.mm v0, v9, v0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v9, 1
+; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    call __muldi3@plt
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %r = call i8 @llvm.vp.reduce.mul.v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 %evl)
+  ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.mul.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_mul_v2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vid.v v9
+; RV32-NEXT:    vmsltu.vx v9, v9, a1
+; RV32-NEXT:    vmand.mm v0, v9, v0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v9, 1
+; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vrgather.vi v9, v8, 1
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    call __mulsi3@plt
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srai a0, a0, 24
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpreduce_mul_v2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vid.v v9
+; RV64-NEXT:    vmsltu.vx v9, v9, a1
+; RV64-NEXT:    vmand.mm v0, v9, v0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v9, 1
+; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vrgather.vi v9, v8, 1
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    call __muldi3@plt
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srai a0, a0, 56
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %r = call i8 @llvm.vp.reduce.mul.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+  ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.mul.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_mul_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vid.v v9
+; RV32-NEXT:    vmsltu.vx v9, v9, a1
+; RV32-NEXT:    vmand.mm v0, v9, v0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv.v.i v9, 1
+; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vrgather.vi v9, v8, 1
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    call __mulsi3@plt
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srai a0, a0, 24
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpreduce_mul_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vid.v v9
+; RV64-NEXT:    vmsltu.vx v9, v9, a1
+; RV64-NEXT:    vmand.mm v0, v9, v0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv.v.i v9, 1
+; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vrgather.vi v9, v8, 1
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    call __muldi3@plt
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srai a0, a0, 56
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %r = call i8 @llvm.vp.reduce.mul.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+  ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.mul.v8i8(i8, <8 x i8>, <8 x i1>, i32)
+
+define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_mul_v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vid.v v10
+; RV32-NEXT:    vmsltu.vx v9, v10, a1
+; RV32-NEXT:    vmand.mm v0, v9, v0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv.v.i v9, 1
+; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vslidedown.vi v9, v8, 4
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vrgather.vi v9, v8, 1
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    call __mulsi3@plt
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srai a0, a0, 24
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpreduce_mul_v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vid.v v10
+; RV64-NEXT:    vmsltu.vx v9, v10, a1
+; RV64-NEXT:    vmand.mm v0, v9, v0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv.v.i v9, 1
+; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vslidedown.vi v9, v8, 4
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vrgather.vi v9, v8, 1
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    call __muldi3@plt
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srai a0, a0, 56
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %r = call i8 @llvm.vp.reduce.mul.v8i8(i8 %s, <8 x i8> %v, <8 x i1> %m, i32 %evl)
+  ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.mul.v16i8(i8, <16 x i8>, <16 x i1>, i32)
+
+define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_mul_v16i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vid.v v12
+; RV32-NEXT:    vmsltu.vx v9, v12, a1
+; RV32-NEXT:    vmand.mm v0, v9, v0
+; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v9, 1
+; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vslidedown.vi v9, v8, 8
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 4
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vrgather.vi v9, v8, 1
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    call __mulsi3@plt
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srai a0, a0, 24
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpreduce_mul_v16i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vid.v v12
+; RV64-NEXT:    vmsltu.vx v9, v12, a1
+; RV64-NEXT:    vmand.mm v0, v9, v0
+; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64-NEXT:    vmv.v.i v9, 1
+; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vslidedown.vi v9, v8, 8
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 4
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vrgather.vi v9, v8, 1
+; RV64-NEXT:    vmul.vv v8, v8, v9
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    call __muldi3@plt
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srai a0, a0, 56
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %r = call i8 @llvm.vp.reduce.mul.v16i8(i8 %s, <16 x i8> %v, <16 x i1> %m, i32 %evl)
+  ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.mul.v32i8(i8, <32 x i8>, <32 x i1>, i32)
+
+define signext i8 @vpreduce_mul_v32i8(i8 signext %s, <32 x i8> %v, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_mul_v32i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vid.v v16
+; RV32-NEXT:    vmsltu.vx v10, v16, a1
+; RV32-NEXT:    vmand.mm v0, v10, v0
+; RV32-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV32-NEXT:    vmv.v.i v10, 1
+; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV32-NEXT:    vslidedown.vi v10, v8, 16
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 8
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 4
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vrgather.vi v10, v8, 1
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    call __mulsi3@plt
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srai a0, a0, 24
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpreduce_mul_v32i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT:    vid.v v16
+; RV64-NEXT:    vmsltu.vx v10, v16, a1
+; RV64-NEXT:    vmand.mm v0, v10, v0
+; RV64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; RV64-NEXT:    vmv.v.i v10, 1
+; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV64-NEXT:    vslidedown.vi v10, v8, 16
+; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 8
+; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vrgather.vi v10, v8, 1
+; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    call __muldi3@plt
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srai a0, a0, 56
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %r = call i8 @llvm.vp.reduce.mul.v32i8(i8 %s, <32 x i8> %v, <32 x i1> %m, i32 %evl)
+  ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.mul.v64i8(i8, <64 x i8>, <64 x i1>, i32)
+
+define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_mul_v64i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT:    lui a2, %hi(.LCPI63_0)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI63_0)
+; RV32-NEXT:    vle32.v v16, (a2)
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    vid.v v24
+; RV32-NEXT:    vmsltu.vx v12, v24, a1
+; RV32-NEXT:    vmsltu.vx v13, v16, a1
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
+; RV32-NEXT:    vslideup.vi v12, v13, 4
+; RV32-NEXT:    li a0, 64
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmand.mm v0, v12, v0
+; RV32-NEXT:    vmv.v.i v12, 1
+; RV32-NEXT:    vmerge.vvm v8, v12, v8, v0
+; RV32-NEXT:    vslidedown.vx v12, v8, a3
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 16
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 8
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    vrgather.vi v12, v8, 1
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    call __mulsi3@plt
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srai a0, a0, 24
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vpreduce_mul_v64i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    li a3, 32
+; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV64-NEXT:    lui a2, %hi(.LCPI63_0)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI63_0)
+; RV64-NEXT:    vle32.v v16, (a2)
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    vid.v v24
+; RV64-NEXT:    vmsltu.vx v12, v24, a1
+; RV64-NEXT:    vmsltu.vx v13, v16, a1
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
+; RV64-NEXT:    vslideup.vi v12, v13, 4
+; RV64-NEXT:    li a0, 64
+; RV64-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64-NEXT:    vmand.mm v0, v12, v0
+; RV64-NEXT:    vmv.v.i v12, 1
+; RV64-NEXT:    vmerge.vvm v8, v12, v8, v0
+; RV64-NEXT:    vslidedown.vx v12, v8, a3
+; RV64-NEXT:    vmul.vv v8, v8, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 16
+; RV64-NEXT:    vmul.vv v8, v8, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 8
+; RV64-NEXT:    vmul.vv v8, v8, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-NEXT:    vmul.vv v8, v8, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vmul.vv v8, v8, v12
+; RV64-NEXT:    vrgather.vi v12, v8, 1
+; RV64-NEXT:    vmul.vv v8, v8, v12
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    call __muldi3@plt
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srai a0, a0, 56
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %r = call i8 @llvm.vp.reduce.mul.v64i8(i8 %s, <64 x i8> %v, <64 x i1> %m, i32 %evl)
+  ret i8 %r
+}