diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10094,14 +10094,28 @@
       if (!Subtarget.is64Bit() || N->getValueType(0) != MVT::i32)
         return;
 
+      // Extend inputs to XLen, and shift by 32. This will add 64 trailing zeros
+      // to the full 128-bit clmul result of multiplying two xlen values.
+      // Perform clmulr or clmulh on the shifted values. Finally, extract the
+      // upper 32 bits.
+      //
+      // The alternative is to mask the inputs to 32 bits and use clmul, but
+      // that requires two shifts to mask each input without zext.w.
+      // FIXME: If the inputs are known zero extended or could be freely
+      // zero extended, the mask form would be better.
       SDValue NewOp0 =
-          DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1));
+          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
       SDValue NewOp1 =
-          DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(2));
-      SDValue Res = DAG.getNode(RISCVISD::CLMUL, DL, MVT::i64, NewOp0, NewOp1);
-      unsigned ShAmt = IntNo == Intrinsic::riscv_clmulh ? 32 : 31;
+          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+      NewOp0 = DAG.getNode(ISD::SHL, DL, MVT::i64, NewOp0,
+                           DAG.getConstant(32, DL, MVT::i64));
+      NewOp1 = DAG.getNode(ISD::SHL, DL, MVT::i64, NewOp1,
+                           DAG.getConstant(32, DL, MVT::i64));
+      unsigned Opc = IntNo == Intrinsic::riscv_clmulh ? RISCVISD::CLMULH
+                                                      : RISCVISD::CLMULR;
+      SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0, NewOp1);
       Res = DAG.getNode(ISD::SRL, DL, MVT::i64, Res,
-                        DAG.getConstant(ShAmt, DL, MVT::i64));
+                        DAG.getConstant(32, DL, MVT::i64));
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
     }
diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbc-intrinsic.ll
@@ -19,12 +19,23 @@
 ; RV64ZBC-LABEL: clmul32r:
 ; RV64ZBC:       # %bb.0:
 ; RV64ZBC-NEXT:    slli a1, a1, 32
-; RV64ZBC-NEXT:    srli a1, a1, 32
 ; RV64ZBC-NEXT:    slli a0, a0, 32
-; RV64ZBC-NEXT:    srli a0, a0, 32
-; RV64ZBC-NEXT:    clmul a0, a0, a1
-; RV64ZBC-NEXT:    srli a0, a0, 31
-; RV64ZBC-NEXT:    sext.w a0, a0
+; RV64ZBC-NEXT:    clmulr a0, a0, a1
+; RV64ZBC-NEXT:    srai a0, a0, 32
+; RV64ZBC-NEXT:    ret
+  %tmp = call i32 @llvm.riscv.clmulr.i32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
+
+; FIXME: We could avoid the slli instructions by using clmul+srli+sext.w since
+; the inputs are zero extended.
+define signext i32 @clmul32r_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64ZBC-LABEL: clmul32r_zext:
+; RV64ZBC:       # %bb.0:
+; RV64ZBC-NEXT:    slli a1, a1, 32
+; RV64ZBC-NEXT:    slli a0, a0, 32
+; RV64ZBC-NEXT:    clmulr a0, a0, a1
+; RV64ZBC-NEXT:    srai a0, a0, 32
 ; RV64ZBC-NEXT:    ret
   %tmp = call i32 @llvm.riscv.clmulr.i32(i32 %a, i32 %b)
   ret i32 %tmp
diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
@@ -44,13 +44,24 @@
 ; RV64ZBC-ZBKC-LABEL: clmul32h:
 ; RV64ZBC-ZBKC:       # %bb.0:
 ; RV64ZBC-ZBKC-NEXT:    slli a1, a1, 32
-; RV64ZBC-ZBKC-NEXT:    srli a1, a1, 32
 ; RV64ZBC-ZBKC-NEXT:    slli a0, a0, 32
-; RV64ZBC-ZBKC-NEXT:    srli a0, a0, 32
-; RV64ZBC-ZBKC-NEXT:    clmul a0, a0, a1
+; RV64ZBC-ZBKC-NEXT:    clmulh a0, a0, a1
 ; RV64ZBC-ZBKC-NEXT:    srai a0, a0, 32
 ; RV64ZBC-ZBKC-NEXT:    ret
   %tmp = call i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b)
   ret i32 %tmp
 }
 
+; FIXME: We could avoid the slli instructions by using clmul+srai since the
+; inputs are zero extended.
+define signext i32 @clmul32h_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64ZBC-ZBKC-LABEL: clmul32h_zext:
+; RV64ZBC-ZBKC:       # %bb.0:
+; RV64ZBC-ZBKC-NEXT:    slli a1, a1, 32
+; RV64ZBC-ZBKC-NEXT:    slli a0, a0, 32
+; RV64ZBC-ZBKC-NEXT:    clmulh a0, a0, a1
+; RV64ZBC-ZBKC-NEXT:    srai a0, a0, 32
+; RV64ZBC-ZBKC-NEXT:    ret
+  %tmp = call i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
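
Note, not part of the patch: the new comment in ReplaceNodeResults relies on the identity that, for 32-bit operands, shifting both inputs left by 32, doing the 64-bit clmulr/clmulh, and then shifting right by 32 reproduces the 32-bit clmulr/clmulh. The standalone C++ sketch below cross-checks that identity against a reference carry-less multiply; the helper names (clmul128, clmulh64, clmulr32, etc.) are ad hoc for illustration, and it assumes the GCC/Clang __uint128_t extension rather than anything from the LLVM tree.

// Standalone cross-check of the shift-by-32 lowering used for the i32
// clmulr/clmulh intrinsics on RV64. Helper names are ad hoc; __uint128_t is a
// GCC/Clang extension.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Reference 64x64->128 carry-less multiply.
static __uint128_t clmul128(uint64_t A, uint64_t B) {
  __uint128_t Res = 0;
  for (int I = 0; I < 64; ++I)
    if ((B >> I) & 1)
      Res ^= (__uint128_t)A << I;
  return Res;
}

// Zbc clmulh/clmulr for XLEN=64: bits [127:64] and [126:63] of the product.
static uint64_t clmulh64(uint64_t A, uint64_t B) {
  return (uint64_t)(clmul128(A, B) >> 64);
}
static uint64_t clmulr64(uint64_t A, uint64_t B) {
  return (uint64_t)(clmul128(A, B) >> 63);
}

// What the i32 intrinsics should compute (Zbc semantics for XLEN=32):
// bits [63:32] and [62:31] of the 64-bit product of the 32-bit inputs.
static uint32_t clmulh32(uint32_t A, uint32_t B) {
  return (uint32_t)((uint64_t)clmul128(A, B) >> 32);
}
static uint32_t clmulr32(uint32_t A, uint32_t B) {
  return (uint32_t)((uint64_t)clmul128(A, B) >> 31);
}

int main() {
  // A handful of arbitrary vectors; the identity holds for any 32-bit inputs.
  uint32_t Vals[] = {0, 1, 3, 0x80000000u, 0x12345678u,
                     0xdeadbeefu, 0xffffffffu};
  for (uint32_t A : Vals) {
    for (uint32_t B : Vals) {
      uint64_t ShA = (uint64_t)A << 32, ShB = (uint64_t)B << 32;
      // The pattern the lowering emits: shl 32, 64-bit clmulh/clmulr, srl 32.
      assert(clmulh32(A, B) == (uint32_t)(clmulh64(ShA, ShB) >> 32));
      assert(clmulr32(A, B) == (uint32_t)(clmulr64(ShA, ShB) >> 32));
    }
  }
  puts("shift-by-32 clmulr/clmulh identity holds for sampled inputs");
  return 0;
}

The bit-index argument is the same one the comment makes: shifting both inputs up by 32 moves the 64-bit product of the 32-bit values up by 64 bits, so the 64-bit clmulh/clmulr see that product in their upper window, and the final right shift by 32 selects exactly the bits the 32-bit definitions ask for.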