diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7786,9 +7786,11 @@
     if (ScalarBits < EltBits)
       return SDValue();

+    // If the LHS is a sign extend, try to use vwmul.
     if (IsSignExt && DAG.ComputeNumSignBits(Op1) > (ScalarBits - NarrowSize)) {
       // Can use vwmul.
     } else {
+      // Otherwise try to use vwmulu or vwmulsu.
       APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
       if (DAG.MaskedValueIsZero(Op1, Mask))
         IsVWMULSU = IsSignExt;
@@ -8438,6 +8440,16 @@
       return Gather;
     break;
   }
+  case RISCVISD::VMV_V_X_VL: {
+    // VMV.V.X only demands the vector element bitwidth from the scalar input.
+    unsigned ScalarSize = N->getOperand(0).getValueSizeInBits();
+    unsigned EltWidth = N->getValueType(0).getScalarSizeInBits();
+    if (ScalarSize > EltWidth)
+      if (SimplifyDemandedLowBitsHelper(0, EltWidth))
+        return SDValue(N, 0);
+
+    break;
+  }
   }

   return SDValue();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

 define <2 x i16> @vwmulsu_v2i16(<2 x i8>* %x, <2 x i8>* %y) {
 ; CHECK-LABEL: vwmulsu_v2i16:
@@ -726,7 +726,7 @@
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v9, (a0)
 ; CHECK-NEXT:    lbu a0, 0(a1)
-; CHECK-NEXT:    vwmulsu.vx v8, v9, a0
+; CHECK-NEXT:    vwmul.vx v8, v9, a0
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, <4 x i16>* %x
   %b = load i8, i8* %y
@@ -779,7 +779,7 @@
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; RV64-NEXT:    vle32.v v9, (a0)
 ; RV64-NEXT:    lbu a0, 0(a1)
-; RV64-NEXT:    vwmulsu.vx v8, v9, a0
+; RV64-NEXT:    vwmul.vx v8, v9, a0
 ; RV64-NEXT:    ret
   %a = load <2 x i32>, <2 x i32>* %x
   %b = load i8, i8* %y
@@ -814,7 +814,7 @@
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; RV64-NEXT:    vle32.v v9, (a0)
 ; RV64-NEXT:    lhu a0, 0(a1)
-; RV64-NEXT:    vwmulsu.vx v8, v9, a0
+; RV64-NEXT:    vwmul.vx v8, v9, a0
 ; RV64-NEXT:    ret
   %a = load <2 x i32>, <2 x i32>* %x
   %b = load i16, i16* %y
@@ -881,11 +881,9 @@
 ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    andi a0, a1, 254
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vsext.vf2 v9, v8
-; CHECK-NEXT:    vmul.vx v8, v9, a0
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a0
 ; CHECK-NEXT:    ret
   %a = load <8 x i8>, <8 x i8>* %x
   %b = and i16 %y, 254
@@ -911,3 +909,19 @@
   %f = mul <4 x i32> %d, %e
   ret <4 x i32> %f
 }
+
+define <4 x i32> @vwmulsu_vx_v4i32_i16_zext(<4 x i16>* %x, i16 %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i16_zext:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a1
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = zext i16 %y to i32
+  %c = insertelement <4 x i32> poison, i32 %b, i32 0
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> zeroinitializer
+  %e = sext <4 x i16> %a to <4 x i32>
+  %f = mul <4 x i32> %d, %e
+  ret <4 x i32> %f
+}
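
For reference, the new test case can also be exercised standalone; this is a minimal sketch that only reuses the RUN-line flags and IR already present in the diff above, nothing beyond what the test file itself contains:

; Run with: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs
; A zero-extended i16 scalar is splatted into a <4 x i32> vector and multiplied
; by a sign-extended <4 x i16> vector; with this patch, the autogenerated checks
; above expect a single vwmulsu.vx for the multiply.
define <4 x i32> @vwmulsu_vx_v4i32_i16_zext(<4 x i16>* %x, i16 %y) {
  %a = load <4 x i16>, <4 x i16>* %x
  %b = zext i16 %y to i32
  %c = insertelement <4 x i32> poison, i32 %b, i32 0
  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> zeroinitializer
  %e = sext <4 x i16> %a to <4 x i32>
  %f = mul <4 x i32> %d, %e
  ret <4 x i32> %f
}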