diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10094,14 +10094,28 @@
       if (!Subtarget.is64Bit() || N->getValueType(0) != MVT::i32)
         return;
 
+      // Extend inputs to XLen, and shift by 32. This will add 64 trailing zeros
+      // to the full 128-bit clmul result of multiplying two xlen values.
+      // Perform clmulr or clmulh on the shifted values. Finally, extract the
+      // upper 32 bits.
+      //
+      // The alternative is to mask the inputs to 32 bits and use clmul, but
+      // that requires two shifts to mask each input without zext.w.
+      // FIXME: If the inputs are known zero extended or could be freely
+      // zero extended, the mask form would be better.
       SDValue NewOp0 =
-          DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1));
+          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
       SDValue NewOp1 =
-          DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(2));
-      SDValue Res = DAG.getNode(RISCVISD::CLMUL, DL, MVT::i64, NewOp0, NewOp1);
-      unsigned ShAmt = IntNo == Intrinsic::riscv_clmulh ? 32 : 31;
+          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+      NewOp0 = DAG.getNode(ISD::SHL, DL, MVT::i64, NewOp0,
+                           DAG.getConstant(32, DL, MVT::i64));
+      NewOp1 = DAG.getNode(ISD::SHL, DL, MVT::i64, NewOp1,
+                           DAG.getConstant(32, DL, MVT::i64));
+      unsigned Opc = IntNo == Intrinsic::riscv_clmulh ? RISCVISD::CLMULH
+                                                      : RISCVISD::CLMULR;
+      SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0, NewOp1);
       Res = DAG.getNode(ISD::SRL, DL, MVT::i64, Res,
-                        DAG.getConstant(ShAmt, DL, MVT::i64));
+                        DAG.getConstant(32, DL, MVT::i64));
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
     }
diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll
@@ -19,12 +19,23 @@
 ; RV64ZBC-LABEL: clmul32r:
 ; RV64ZBC:       # %bb.0:
 ; RV64ZBC-NEXT:    slli a1, a1, 32
-; RV64ZBC-NEXT:    srli a1, a1, 32
 ; RV64ZBC-NEXT:    slli a0, a0, 32
-; RV64ZBC-NEXT:    srli a0, a0, 32
-; RV64ZBC-NEXT:    clmul a0, a0, a1
-; RV64ZBC-NEXT:    srli a0, a0, 31
-; RV64ZBC-NEXT:    sext.w a0, a0
+; RV64ZBC-NEXT:    clmulr a0, a0, a1
+; RV64ZBC-NEXT:    srai a0, a0, 32
+; RV64ZBC-NEXT:    ret
+  %tmp = call i32 @llvm.riscv.clmulr.i32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
+
+; FIXME: We could avoid the slli instructions by using clmul+srli+sext.w since
+; the inputs are zero extended.
+define signext i32 @clmul32r_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64ZBC-LABEL: clmul32r_zext:
+; RV64ZBC:       # %bb.0:
+; RV64ZBC-NEXT:    slli a1, a1, 32
+; RV64ZBC-NEXT:    slli a0, a0, 32
+; RV64ZBC-NEXT:    clmulr a0, a0, a1
+; RV64ZBC-NEXT:    srai a0, a0, 32
 ; RV64ZBC-NEXT:    ret
   %tmp = call i32 @llvm.riscv.clmulr.i32(i32 %a, i32 %b)
   ret i32 %tmp
diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
@@ -44,13 +44,24 @@
 ; RV64ZBC-ZBKC-LABEL: clmul32h:
 ; RV64ZBC-ZBKC:       # %bb.0:
 ; RV64ZBC-ZBKC-NEXT:    slli a1, a1, 32
-; RV64ZBC-ZBKC-NEXT:    srli a1, a1, 32
 ; RV64ZBC-ZBKC-NEXT:    slli a0, a0, 32
-; RV64ZBC-ZBKC-NEXT:    srli a0, a0, 32
-; RV64ZBC-ZBKC-NEXT:    clmul a0, a0, a1
+; RV64ZBC-ZBKC-NEXT:    clmulh a0, a0, a1
 ; RV64ZBC-ZBKC-NEXT:    srai a0, a0, 32
 ; RV64ZBC-ZBKC-NEXT:    ret
   %tmp = call i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b)
   ret i32 %tmp
 }
 
+; FIXME: We could avoid the slli instructions by using clmul+srai since the
+; inputs are zero extended.
+define signext i32 @clmul32h_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64ZBC-ZBKC-LABEL: clmul32h_zext:
+; RV64ZBC-ZBKC:       # %bb.0:
+; RV64ZBC-ZBKC-NEXT:    slli a1, a1, 32
+; RV64ZBC-ZBKC-NEXT:    slli a0, a0, 32
+; RV64ZBC-ZBKC-NEXT:    clmulh a0, a0, a1
+; RV64ZBC-ZBKC-NEXT:    srai a0, a0, 32
+; RV64ZBC-ZBKC-NEXT:    ret
+  %tmp = call i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
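
Note, not part of the patch: the new comment in ReplaceNodeResults relies on the identity that, for 32-bit operands, shifting both inputs left by 32, doing the 64-bit clmulr/clmulh, and then shifting right by 32 reproduces the 32-bit clmulr/clmulh. The standalone C++ sketch below cross-checks that identity against a reference carry-less multiply; the helper names (clmul128, clmulh64, clmulr32, etc.) are ad hoc for illustration, and it assumes the GCC/Clang __uint128_t extension rather than anything from the LLVM tree.

// Standalone cross-check of the shift-by-32 lowering used for the i32
// clmulr/clmulh intrinsics on RV64. Helper names are ad hoc; __uint128_t is a
// GCC/Clang extension.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Reference 64x64->128 carry-less multiply.
static __uint128_t clmul128(uint64_t A, uint64_t B) {
  __uint128_t Res = 0;
  for (int I = 0; I < 64; ++I)
    if ((B >> I) & 1)
      Res ^= (__uint128_t)A << I;
  return Res;
}

// Zbc clmulh/clmulr for XLEN=64: bits [127:64] and [126:63] of the product.
static uint64_t clmulh64(uint64_t A, uint64_t B) {
  return (uint64_t)(clmul128(A, B) >> 64);
}
static uint64_t clmulr64(uint64_t A, uint64_t B) {
  return (uint64_t)(clmul128(A, B) >> 63);
}

// What the i32 intrinsics should compute (Zbc semantics for XLEN=32):
// bits [63:32] and [62:31] of the 64-bit product of the 32-bit inputs.
static uint32_t clmulh32(uint32_t A, uint32_t B) {
  return (uint32_t)((uint64_t)clmul128(A, B) >> 32);
}
static uint32_t clmulr32(uint32_t A, uint32_t B) {
  return (uint32_t)((uint64_t)clmul128(A, B) >> 31);
}

int main() {
  // A handful of arbitrary vectors; the identity holds for any 32-bit inputs.
  uint32_t Vals[] = {0, 1, 3, 0x80000000u, 0x12345678u,
                     0xdeadbeefu, 0xffffffffu};
  for (uint32_t A : Vals) {
    for (uint32_t B : Vals) {
      uint64_t ShA = (uint64_t)A << 32, ShB = (uint64_t)B << 32;
      // The pattern the lowering emits: shl 32, 64-bit clmulh/clmulr, srl 32.
      assert(clmulh32(A, B) == (uint32_t)(clmulh64(ShA, ShB) >> 32));
      assert(clmulr32(A, B) == (uint32_t)(clmulr64(ShA, ShB) >> 32));
    }
  }
  puts("shift-by-32 clmulr/clmulh identity holds for sampled inputs");
  return 0;
}

The bit-index argument is the same one the comment makes: shifting both inputs up by 32 moves the 64-bit product of the 32-bit values up by 64 bits, so the 64-bit clmulh/clmulr see that product in their upper window, and the final right shift by 32 selects exactly the bits the 32-bit definitions ask for.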