diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6576,6 +6576,87 @@
   return SDValue(N, 0);
 }
 
+// Try to form VWMUL or VWMULU.
+// FIXME: Support VWMULSU.
+static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1,
+                                    SelectionDAG &DAG) {
+  assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode");
+  bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
+  bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
+  if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse())
+    return SDValue();
+
+  SDValue Mask = N->getOperand(2);
+  SDValue VL = N->getOperand(3);
+
+  // Make sure the mask and VL match.
+  if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL)
+    return SDValue();
+
+  MVT VT = N->getSimpleValueType(0);
+
+  // Determine the narrow size for a widening multiply.
+  unsigned NarrowSize = VT.getScalarSizeInBits() / 2;
+  MVT NarrowVT = MVT::getVectorVT(MVT::getIntegerVT(NarrowSize),
+                                  VT.getVectorElementCount());
+
+  SDLoc DL(N);
+
+  // See if the other operand is the same opcode.
+  if (Op0.getOpcode() == Op1.getOpcode()) {
+    if (!Op1.hasOneUse())
+      return SDValue();
+
+    // Make sure the mask and VL match.
+    if (Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
+      return SDValue();
+
+    Op1 = Op1.getOperand(0);
+  } else if (Op1.getOpcode() == RISCVISD::VMV_V_X_VL) {
+    // The operand is a splat of a scalar.
+
+    // The VL must be the same.
+    if (Op1.getOperand(1) != VL)
+      return SDValue();
+
+    // Get the scalar value.
+    Op1 = Op1.getOperand(0);
+
+    // See if we have enough sign bits or zero bits in the scalar to use a
+    // widening multiply by splatting to smaller element size.
+    unsigned EltBits = VT.getScalarSizeInBits();
+    unsigned ScalarBits = Op1.getValueSizeInBits();
+    // Make sure we're getting all element bits from the scalar register.
+    // FIXME: Support implicit sign extension of vmv.v.x?
+    if (ScalarBits < EltBits)
+      return SDValue();
+
+    if (IsSignExt) {
+      if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize))
+        return SDValue();
+    } else {
+      APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
+      if (!DAG.MaskedValueIsZero(Op1, Mask))
+        return SDValue();
+    }
+
+    Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op1, VL);
+  } else
+    return SDValue();
+
+  Op0 = Op0.getOperand(0);
+
+  // Re-introduce narrower extends if needed.
+  unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
+  if (Op0.getValueType() != NarrowVT)
+    Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
+  if (Op1.getValueType() != NarrowVT)
+    Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
+
+  unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+  return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+}
+
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -7027,45 +7108,13 @@
     break;
   }
   case RISCVISD::MUL_VL: {
-    // Try to form VWMUL or VWMULU.
-    // FIXME: Look for splat of extended scalar as well.
-    // FIXME: Support VWMULSU.
     SDValue Op0 = N->getOperand(0);
     SDValue Op1 = N->getOperand(1);
-    bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
-    bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
-    if ((!IsSignExt && !IsZeroExt) || Op0.getOpcode() != Op1.getOpcode())
-      return SDValue();
-
-    // Make sure the extends have a single use.
-    if (!Op0.hasOneUse() || !Op1.hasOneUse())
-      return SDValue();
-
-    SDValue Mask = N->getOperand(2);
-    SDValue VL = N->getOperand(3);
-    if (Op0.getOperand(1) != Mask || Op1.getOperand(1) != Mask ||
-        Op0.getOperand(2) != VL || Op1.getOperand(2) != VL)
-      return SDValue();
-
-    Op0 = Op0.getOperand(0);
-    Op1 = Op1.getOperand(0);
-
-    MVT VT = N->getSimpleValueType(0);
-    MVT NarrowVT =
-        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() / 2),
-                         VT.getVectorElementCount());
-
-    SDLoc DL(N);
-
-    // Re-introduce narrower extends if needed.
-    unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
-    if (Op0.getValueType() != NarrowVT)
-      Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
-    if (Op1.getValueType() != NarrowVT)
-      Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
-
-    unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
-    return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+    if (SDValue V = combineMUL_VLToVWMUL(N, Op0, Op1, DAG))
+      return V;
+    if (SDValue V = combineMUL_VLToVWMUL(N, Op1, Op0, DAG))
+      return V;
+    return SDValue();
   }
   }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 define <2 x i16> @vwmul_v2i16(<2 x i8>* %x, <2 x i8>* %y) {
 ; CHECK-LABEL: vwmul_v2i16:
@@ -649,3 +649,239 @@
   ret <16 x i64> %f
 }
 
+define <8 x i16> @vwmul_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmul_vx_v8i16_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vwmul.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i16
+  %d = insertelement <8 x i16> undef, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = sext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <8 x i16> @vwmul_vx_v8i16_i16(<8 x i8>* %x, i16* %y) {
+; CHECK-LABEL: vwmul_vx_v8i16_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT:    vsext.vf2 v26, v25
+; CHECK-NEXT:    vmul.vx v8, v26, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i16, i16* %y
+  %d = insertelement <8 x i16> undef, i16 %b, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = sext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vwmul.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vwmul.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i16, i16* %y
+  %c = sext i16 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i32(<4 x i16>* %x, i32* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT:    vsext.vf2 v26, v25
+; CHECK-NEXT:    vmul.vx v8, v26, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i32, i32* %y
+  %d = insertelement <4 x i32> undef, i32 %b, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lb a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lb a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v25, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lh a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lh a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v25, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i16, i16* %y
+  %c = sext i16 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lw a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v25, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i32, i32* %y
+  %c = sext i32 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i64(<2 x i32>* %x, i64* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    sw a2, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vsext.vf2 v26, v25
+; RV64-NEXT:    vmul.vx v8, v26, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i64, i64* %y
+  %d = insertelement <2 x i64> undef, i64 %b, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -649,3 +649,246 @@
   ret <16 x i64> %f
 }
 
+define <8 x i16> @vwmulu_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmulu_vx_v8i16_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    lbu a0, 0(a1)
+; CHECK-NEXT:    vwmulu.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i16
+  %d = insertelement <8 x i16> undef, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = zext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <8 x i16> @vwmulu_vx_v8i16_i16(<8 x i8>* %x, i16* %y) {
+; CHECK-LABEL: vwmulu_vx_v8i16_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT:    vzext.vf2 v26, v25
+; CHECK-NEXT:    vmul.vx v8, v26, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i16, i16* %y
+  %d = insertelement <8 x i16> undef, i16 %b, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = zext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <4 x i32> @vwmulu_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwmulu_vx_v4i32_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lbu a0, 0(a1)
+; CHECK-NEXT:    vwmulu.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = zext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmulu_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwmulu_vx_v4i32_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lhu a0, 0(a1)
+; CHECK-NEXT:    vwmulu.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = zext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmulu_vx_v4i32_i32(<4 x i16>* %x, i32* %y) {
+; CHECK-LABEL: vwmulu_vx_v4i32_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT:    vzext.vf2 v26, v25
+; CHECK-NEXT:    vmul.vx v8, v26, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i32, i32* %y
+  %d = insertelement <4 x i32> undef, i32 %b, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = zext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lb a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vzext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lb a0, 0(a1)
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vzext.vf2 v26, v25
+; RV64-NEXT:    vmul.vx v8, v26, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lh a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vzext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lh a0, 0(a1)
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vzext.vf2 v26, v25
+; RV64-NEXT:    vmul.vx v8, v26, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vzext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lw a0, 0(a1)
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vzext.vf2 v26, v25
+; RV64-NEXT:    vmul.vx v8, v26, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i32, i32* %y
+  %c = zext i32 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i64(<2 x i32>* %x, i64* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    sw a2, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vzext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vzext.vf2 v26, v25
+; RV64-NEXT:    vmul.vx v8, v26, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i64, i64* %y
+  %d = insertelement <2 x i64> undef, i64 %b, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = zext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
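
For context, the new vwmul_vx_* and vwmulu_vx_* tests exercise the splat-of-an-extended-scalar case that combineMUL_VLToVWMUL now handles. A minimal C sketch of a loop that lowers to this pattern is below; the function name is illustrative, and whether the vectorizer emits exactly the IR in the tests depends on its cost model.

#include <stdint.h>

// Multiply each i8 element by one i8 scalar, widening to i16. Vectorized,
// this is mul(sext(<N x i8>), splat(sext(i8))), i.e. a MUL_VL whose operands
// are VSEXT_VL and VMV_V_X_VL, so the combine above can select vwmul.vx
// instead of a vsext.vf2 followed by vmul.vx.
void mul_by_scalar(int16_t *dst, const int8_t *src, int8_t s, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = (int16_t)src[i] * (int16_t)s;
}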