diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7562,8 +7562,34 @@
   return SDValue();
 }
 
+// Return true if Op is equivalent to a zero-extend operation.
+// FIXME: There are more operations that could be handled here.
+static bool isZeroExtOp(SDValue &Op, int ScalarBits, int NarrowSize,
+                        SelectionDAG &DAG) {
+  // A zero-extending load is a zero-extend operation, provided the bits
+  // above the narrow type are known to be zero.
+  if (ISD::isZEXTLoad(Op.getNode())) {
+    APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
+    if (!DAG.MaskedValueIsZero(Op, Mask))
+      return false;
+    return true;
+  }
+
+  // An ISD::AND with the all-ones mask of the narrow type acts as a
+  // zero-extend from NarrowSize bits.
+  if (Op.getOpcode() == ISD::AND) {
+    if (auto *AndRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      uint64_t Val = maskTrailingOnes<uint64_t>(NarrowSize);
+      if (AndRHS->getAPIntValue() == Val) {
+        if (Op.hasOneUse())
+          Op = Op.getOperand(0);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 // Try to form VWMUL, VWMULU or VWMULSU.
-// TODO: Support VWMULSU.vx with a sign extend Op and a splat of scalar Op.
 static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
                                        bool Commute) {
   assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode");
@@ -7623,7 +7649,9 @@
   if (ScalarBits < EltBits)
     return SDValue();
 
-  if (IsSignExt) {
+  if (IsSignExt && isZeroExtOp(Op1, ScalarBits, NarrowSize, DAG)) {
+    IsVWMULSU = true;
+  } else if (IsSignExt) {
     if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize))
       return SDValue();
   } else {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 define <2 x i16> @vwmulsu_v2i16(<2 x i8>* %x, <2 x i8>* %y) {
 ; CHECK-LABEL: vwmulsu_v2i16:
 ; CHECK: # %bb.0:
@@ -681,3 +681,233 @@
   %f = mul <16 x i64> %d, %e
   ret <16 x i64> %f
 }
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: lbu a0, 0(a1)
+; CHECK-NEXT: vwmulsu.vx v8, v9, a0
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i16
+  %d = insertelement <8 x i16> poison, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> poison, <8 x i32> zeroinitializer
+  %f = sext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8_swap(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: lb a0, 0(a1)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vzext.vf2 v9, v8
+; CHECK-NEXT: vmul.vx v8, v9, a0
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i16
+  %d = insertelement <8 x i16> poison, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> poison, <8 x i32> zeroinitializer
+  %f = zext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <4 x i32> @vwmulsu_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: lbu a0, 0(a1)
+; CHECK-NEXT: vwmulsu.vx v8, v9, a0
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i32
+  %d = insertelement <4 x i32> poison, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> poison, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmulsu_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: lhu a0, 0(a1)
+; CHECK-NEXT: vwmulsu.vx v8, v9, a0
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i32
+  %d = insertelement <4 x i32> poison, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> poison, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <2 x i64> @vwmulsu_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
+; RV32-LABEL: vwmulsu_vx_v2i64_i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lbu a1, 0(a1)
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v10, v8
+; RV32-NEXT: vmul.vv v8, v9, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmulsu_vx_v2i64_i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: lbu a0, 0(a1)
+; RV64-NEXT: vwmulsu.vx v8, v9, a0
+; RV64-NEXT: ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i64
+  %d = insertelement <2 x i64> poison, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulsu_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
+; RV32-LABEL: vwmulsu_vx_v2i64_i16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lhu a1, 0(a1)
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v10, v8
+; RV32-NEXT: vmul.vv v8, v9, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmulsu_vx_v2i64_i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: lhu a0, 0(a1)
+; RV64-NEXT: vwmulsu.vx v8, v9, a0
+; RV64-NEXT: ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i64
+  %d = insertelement <2 x i64> poison, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulsu_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
+; RV32-LABEL: vwmulsu_vx_v2i64_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v10, v8
+; RV32-NEXT: vmul.vv v8, v9, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmulsu_vx_v2i64_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: lwu a0, 0(a1)
+; RV64-NEXT: vwmulsu.vx v8, v9, a0
+; RV64-NEXT: ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i32, i32* %y
+  %c = zext i32 %b to i64
+  %d = insertelement <2 x i64> poison, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8_and(<8 x i8>* %x, i16 %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vwmulsu.vx v8, v9, a1
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = and i16 %y, 255
+  %c = insertelement <8 x i16> poison, i16 %b, i32 0
+  %d = shufflevector <8 x i16> %c, <8 x i16> poison, <8 x i32> zeroinitializer
+  %e = sext <8 x i8> %a to <8 x i16>
+  %f = mul <8 x i16> %d, %e
+  ret <8 x i16> %f
+}
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8_and1(<8 x i8>* %x, i16 %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: andi a0, a1, 254
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vsext.vf2 v9, v8
+; CHECK-NEXT: vmul.vx v8, v9, a0
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = and i16 %y, 254
+  %c = insertelement <8 x i16> poison, i16 %b, i32 0
+  %d = shufflevector <8 x i16> %c, <8 x i16> poison, <8 x i32> zeroinitializer
+  %e = sext <8 x i8> %a to <8 x i16>
+  %f = mul <8 x i16> %d, %e
+  ret <8 x i16> %f
+}
+
+define <4 x i32> @vwmulsu_vx_v4i32_i16_and(<4 x i16>* %x, i32 %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i16_and:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vwmulsu.vx v8, v9, a1
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = and i32 %y, 65535
+  %c = insertelement <4 x i32> poison, i32 %b, i32 0
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> zeroinitializer
+  %e = sext <4 x i16> %a to <4 x i32>
+  %f = mul <4 x i32> %d, %e
+  ret <4 x i32> %f
+}
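
Reference note (not part of the patch): the ISD::AND case in isZeroExtOp above only fires for the all-ones mask of the narrow type, because any other mask clears extra bits and is no longer equivalent to a zero-extend; this is why vwmulsu_vx_v8i16_i8_and (mask 255) combines while vwmulsu_vx_v8i16_i8_and1 (mask 254) does not. A minimal standalone C++ sketch of that property, independent of LLVM and using a hypothetical zextLow helper:

#include <cassert>
#include <cstdint>

// Reference behaviour: zero-extend the low NarrowSize bits of X.
static uint64_t zextLow(uint64_t X, int NarrowSize) {
  return X & ((UINT64_C(1) << NarrowSize) - 1);
}

int main() {
  const int NarrowSize = 8;
  const uint64_t FullMask = (UINT64_C(1) << NarrowSize) - 1; // 0xff
  for (uint64_t X = 0; X <= 0x1ff; ++X) {
    // AND with the all-ones narrow mask is exactly a zero-extend from i8.
    assert((X & FullMask) == zextLow(X, NarrowSize));
  }
  // AND with 254 also clears bit 0, so it is not a zero-extend
  // (e.g. for X = 255 the results differ).
  assert((UINT64_C(255) & 254) != zextLow(255, NarrowSize));
  return 0;
}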