Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -8385,6 +8385,40 @@ = frem float 4.0, %var ; yields float:result = 4.0 % %var +'``llvm.experimental.clmul``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + define <16 x i8> @llvm.experimental.clmul.v16i8(<16 x i8> , <16 x i8> ) + +Overview: +""""""""" + +This is an overloaded intrinsic. You can use '``llvm.experimental.clmul.*``' on +any integer bit width and for different address spaces. Not all targets support +all bit widths however. + +Arguments: +"""""""""" + +The two arguments to the '``llvm.experimental.clmul.*``' intrinsic must be +:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both +arguments must have identical types. + +Semantics: +"""""""""" + +This is not the same as multiplication. It is as if multiplication was done, +but with no carrying of overflow values. If the operands are polynomials in a +Galois field of 2 elements, then this is equivalent to multiplication. Thus, +this operation is also known as polynomial multiplication. + +There is no guarantee that execution will happen in constant time. + .. _bitwiseops: Bitwise Binary Operations Index: llvm/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -588,6 +588,9 @@ CTPOP, BITREVERSE, + // Carry-less multiplication. + CLMUL, + /// Bit counting operators with an undefined result for zero inputs. 
CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF, Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -863,6 +863,10 @@ def int_cttz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>; } +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] in { + def int_experimental_clmul : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; +} + //===------------------------ Debugger Intrinsics -------------------------===// // Index: llvm/include/llvm/Support/TargetOpcodes.def =================================================================== --- llvm/include/llvm/Support/TargetOpcodes.def +++ llvm/include/llvm/Support/TargetOpcodes.def @@ -607,6 +607,9 @@ /// Generic bit reverse. HANDLE_TARGET_OPCODE(G_BITREVERSE) +/// Carry-less multiplication. +HANDLE_TARGET_OPCODE(G_CLMUL) + /// Floating point ceil. HANDLE_TARGET_OPCODE(G_FCEIL) Index: llvm/include/llvm/Target/GenericOpcodes.td =================================================================== --- llvm/include/llvm/Target/GenericOpcodes.td +++ llvm/include/llvm/Target/GenericOpcodes.td @@ -183,6 +183,12 @@ let hasSideEffects = 0; } +def G_CLMUL : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + def G_BSWAP : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src); Index: llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td =================================================================== --- llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -100,6 +100,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; Index: 
llvm/include/llvm/Target/TargetSelectionDAG.td =================================================================== --- llvm/include/llvm/Target/TargetSelectionDAG.td +++ llvm/include/llvm/Target/TargetSelectionDAG.td @@ -418,6 +418,7 @@ def ctpop : SDNode<"ISD::CTPOP" , SDTIntBitCountUnaryOp>; def ctlz_zero_undef : SDNode<"ISD::CTLZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>; def cttz_zero_undef : SDNode<"ISD::CTTZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>; +def clmul : SDNode<"ISD::CLMUL" , SDTIntBinOp, [SDNPCommutative]>; def sext : SDNode<"ISD::SIGN_EXTEND", SDTIntExtendOp>; def zext : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>; def anyext : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>; Index: llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1232,6 +1232,8 @@ return TargetOpcode::G_FCOS; case Intrinsic::ctpop: return TargetOpcode::G_CTPOP; + case Intrinsic::experimental_clmul: + return TargetOpcode::G_CLMUL; case Intrinsic::exp: return TargetOpcode::G_FEXP; case Intrinsic::exp2: Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -131,6 +131,7 @@ case ISD::XOR: case ISD::ADD: case ISD::SUB: + case ISD::CLMUL: case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::SDIV: Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6266,6 +6266,13 @@ setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg)); return; } + case Intrinsic::experimental_clmul: { + SDValue Arg0 = getValue(I.getArgOperand(0)); + EVT Ty = 
Arg0.getValueType(); + SDValue Arg1 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::CLMUL, sdl, Ty, {Arg0, Arg1})); + return; + } case Intrinsic::fshl: case Intrinsic::fshr: { bool IsFSHL = Intrinsic == Intrinsic::fshl; Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -411,6 +411,9 @@ case ISD::CTLZ: return "ctlz"; case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; + case ISD::CLMUL: + return "clmul"; + // Trampolines case ISD::INIT_TRAMPOLINE: return "init_trampoline"; case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; Index: llvm/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/lib/IR/AutoUpgrade.cpp +++ llvm/lib/IR/AutoUpgrade.cpp @@ -601,6 +601,18 @@ return true; } } + // The last . here is important, as pmull is a different instruction + if (Name.startswith("aarch64.neon.pmul.")) { + if (F->arg_size() != 2) + break; // Invalid IR. 
+ VectorType *Ty = dyn_cast(F->getReturnType()); + if (Ty && Ty->getElementType()->isIntegerTy() && + Ty->getElementType()->getIntegerBitWidth() == 8) + NewFn = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::experimental_clmul, F->getReturnType()); + + return true; + } break; } @@ -3589,6 +3601,13 @@ break; } + case Intrinsic::experimental_clmul: { + SmallVector Args(CI->arg_operands().begin(), + CI->arg_operands().end()); + NewCall = Builder.CreateCall(NewFn, Args); + break; + } + case Intrinsic::bitreverse: NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)}); break; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -402,6 +402,11 @@ setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); + if (Subtarget->hasNEON()) { + setOperationAction(ISD::CLMUL, MVT::v16i8, Legal); + setOperationAction(ISD::CLMUL, MVT::v8i8, Legal); + } + // Custom lower Add/Sub/Mul with overflow. 
setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::SADDO, MVT::i64, Custom); Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4061,6 +4061,11 @@ defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; +def : Pat<(v16i8 (clmul V128:$Rn, V128:$Rm)), + (PMULv16i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v8i8 (clmul V64:$Rn, V64:$Rm)), + (PMULv8i8 V64:$Rn, V64:$Rm)>; + defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; Index: llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -492,6 +492,9 @@ # DEBUG-NEXT: G_BITREVERSE (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_CLMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected Index: llvm/test/CodeGen/AArch64/clmul.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/clmul.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s + +declare <16 x i8> @llvm.experimental.clmul.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @clmul_v16i8(<16 x i8> %l, <16 x i8> %r) { +; CHECK-LABEL: clmul_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: pmul v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %ret = call <16 x i8> @llvm.experimental.clmul.v16i8(<16 x i8> %l, <16 x i8> %r) + ret <16 x i8> %ret +} + +declare <8 x i8> @llvm.experimental.clmul.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @clmul_v8i8(<8 x i8> %l, <8 x i8> %r) { +; CHECK-LABEL: clmul_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: pmul v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %ret = call <8 x i8> @llvm.experimental.clmul.v8i8(<8 x i8> %l, <8 x i8> %r) + ret <8 x i8> %ret +}