Index: llvm/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -272,6 +272,12 @@ /// resulting value is this minimum value. SSUBSAT, USUBSAT, + /// RESULT = SMULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on + /// 2 integers with the same width and scale. SCALE represents the scale of + /// both operands as fixed point numbers. A scale of zero is effectively + /// performing saturation multiplication on 2 integers. + SMULFIX, + /// Simple binary floating point operators. FADD, FSUB, FMUL, FDIV, FREM, Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -797,6 +797,38 @@ return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } + /// Custom method defined by each target to indicate if an operation which + /// may require a saturation bit width is supported natively by the target. + /// If not, the operation is illegal. + virtual bool isSupportedFixedPointOperation(unsigned Op, EVT VT, + unsigned Scale) const { + return false; + } + + /// Some fixed point operations may be natively supported by the target but + /// only for specific scales. This method allows for checking + /// if the width is supported by the target for a given operation that may + /// depend on scale. + LegalizeAction getFixedPointOperationAction(unsigned Op, EVT VT, + unsigned Scale) const { + auto Action = getOperationAction(Op, VT); + if (Action != Legal) + return Action; + + // This operation is supported in this type but may only work on specific + // saturation widths. + bool Supported; + switch (Op) { + default: + llvm_unreachable("Unexpected fixed point operation."); + case ISD::SMULFIX: + Supported = isSupportedFixedPointOperation(Op, VT, Scale); + break; + } + + return Supported ? Action : Expand; + } + LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const { unsigned EqOpc; switch (Op) { @@ -3749,6 +3781,11 @@ SDValue getExpandedSaturationAdditionSubtraction(SDNode *Node, SelectionDAG &DAG) const; + /// Method for building the DAG expansion of ISD::SMULFIX. This method accepts + /// integers or vectors of integers as its arguments. + SDValue getExpandedFixedPointMultiplication(SDNode *Node, + SelectionDAG &DAG) const; + //===--------------------------------------------------------------------===// // Instruction Emitting Hooks // Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -749,6 +749,9 @@ def int_usub_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +def int_smul_fix : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; //===------------------------- Memory Use Markers -------------------------===// // Index: llvm/include/llvm/Target/TargetSelectionDAG.td =================================================================== --- llvm/include/llvm/Target/TargetSelectionDAG.td +++ llvm/include/llvm/Target/TargetSelectionDAG.td @@ -377,6 +377,7 @@ def uaddsat : SDNode<"ISD::UADDSAT" , SDTIntBinOp, [SDNPCommutative]>; def ssubsat : SDNode<"ISD::SSUBSAT" , SDTIntBinOp>; def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>; +def smulfix : SDNode<"ISD::SMULFIX" , SDTIntBinOp, [SDNPCommutative]>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>; Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1128,6 +1128,12 @@ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; } + case ISD::SMULFIX: { + unsigned Scale = cast(Node->getOperand(2))->getZExtValue(); + Action = TLI.getFixedPointOperationAction(Node->getOpcode(), + Node->getValueType(0), Scale); + break; + } case ISD::MSCATTER: Action = TLI.getOperationAction(Node->getOpcode(), cast(Node)->getValue().getValueType()); @@ -3269,6 +3275,10 @@ Results.push_back(TLI.getExpandedSaturationAdditionSubtraction(Node, DAG)); break; } + case ISD::SMULFIX: { + Results.push_back(TLI.getExpandedFixedPointMultiplication(Node, DAG)); + break; + } case ISD::SADDO: case ISD::SSUBO: { SDValue LHS = Node->getOperand(0); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -145,6 +145,7 @@ case ISD::UADDSAT: case ISD::SSUBSAT: case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break; + case ISD::SMULFIX: Res = PromoteIntRes_SMULFIX(N); break; case ISD::ATOMIC_LOAD: Res = PromoteIntRes_Atomic0(cast(N)); break; @@ -616,6 +617,18 @@ return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); } +SDValue DAGTypeLegalizer::PromoteIntRes_SMULFIX(SDNode *N) { + // Can just promote the operands then continue with operation. + SDLoc dl(N); + SDValue Op1 = N->getOperand(0); + SDValue Op2 = N->getOperand(1); + SDValue Op1Promoted = GetPromotedInteger(Op1); + SDValue Op2Promoted = GetPromotedInteger(Op2); + EVT PromotedType = Op1Promoted.getValueType(); + return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted, + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); @@ -1541,6 +1554,7 @@ case ISD::UADDSAT: case ISD::SSUBSAT: case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break; + case ISD::SMULFIX: ExpandIntRes_SMULFIX(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -2509,6 +2523,33 @@ SplitInteger(Result, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_SMULFIX(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + unsigned Scale = cast(N->getOperand(2))->getZExtValue(); + if (!Scale) { + SDValue Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + SplitInteger(Result, Lo, Hi); + return; + } + + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue LL, LH, RL, RH; + GetExpandedInteger(LHS, LL, LH); + GetExpandedInteger(RHS, RL, RH); + if (!TLI.expandMUL( + DAG.getNode(ISD::MUL, dl, VT, LHS, RHS).getNode(), Lo, Hi, NVT, DAG, + TargetLowering::MulExpansionKind::OnlyLegalOrCustom, LL, LH, RL, RH)) + return; + + Lo = DAG.getNode(ISD::SRL, dl, NVT, Lo, DAG.getConstant(Scale, dl, NVT)); + Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi, + DAG.getConstant(NVT.getScalarSizeInBits() - Scale, dl, NVT)); +} + void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -331,6 +331,7 @@ SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_ADDSUBSAT(SDNode *N); + SDValue PromoteIntRes_SMULFIX(SDNode *N); // Integer Operand Promotion. bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -416,6 +417,7 @@ void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBSAT (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_SMULFIX (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -671,6 +673,8 @@ SDValue ScalarizeVecRes_UNDEF(SDNode *N); SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N); + SDValue ScalarizeVecRes_SMULFIX(SDNode *N); + // Vector Operand Scalarization: <1 x ty> -> ty. bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_BITCAST(SDNode *N); @@ -706,6 +710,8 @@ void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -403,6 +403,12 @@ case ISD::USUBSAT: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; + case ISD::SMULFIX: { + unsigned Scale = cast(Node->getOperand(2))->getZExtValue(); + Action = TLI.getFixedPointOperationAction(Node->getOpcode(), + Node->getValueType(0), Scale); + break; + } case ISD::FP_ROUND_INREG: Action = TLI.getOperationAction(Node->getOpcode(), cast(Node->getOperand(1))->getVT()); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -172,6 +172,9 @@ case ISD::STRICT_FTRUNC: R = ScalarizeVecRes_StrictFPOp(N); break; + case ISD::SMULFIX: + R = ScalarizeVecRes_SMULFIX(N); + break; } // If R is null, the sub-method took care of registering the result. @@ -194,6 +197,14 @@ Op0.getValueType(), Op0, Op1, Op2); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_SMULFIX(SDNode *N) { + SDValue Op0 = GetScalarizedVector(N->getOperand(0)); + SDValue Op1 = GetScalarizedVector(N->getOperand(1)); + SDValue Op2 = N->getOperand(2); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1, + Op2); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) { EVT VT = N->getValueType(0).getVectorElementType(); unsigned NumOpers = N->getNumOperands(); @@ -848,6 +859,9 @@ case ISD::STRICT_FTRUNC: SplitVecRes_StrictFPOp(N, Lo, Hi); break; + case ISD::SMULFIX: + SplitVecRes_SMULFIX(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -885,6 +899,20 @@ Op0Hi, Op1Hi, Op2Hi); } +void DAGTypeLegalizer::SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue LHSLo, LHSHi; + GetSplitVector(N->getOperand(0), LHSLo, LHSHi); + SDValue RHSLo, RHSHi; + GetSplitVector(N->getOperand(1), RHSLo, RHSHi); + SDLoc dl(N); + SDValue Op2 = N->getOperand(2); + + unsigned Opcode = N->getOpcode(); + Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2); + Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2); +} + void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // We know the result is a vector. The input may be either a vector or a Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5814,6 +5814,14 @@ setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2)); return nullptr; } + case Intrinsic::smul_fix: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + setValue(&I, + DAG.getNode(ISD::SMULFIX, sdl, Op1.getValueType(), Op1, Op2, Op3)); + return nullptr; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode( Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -295,6 +295,7 @@ case ISD::UADDSAT: return "uaddsat"; case ISD::SSUBSAT: return "ssubsat"; case ISD::USUBSAT: return "usubsat"; + case ISD::SMULFIX: return "smulfix"; // Conversion operators. case ISD::SIGN_EXTEND: return "sign_extend"; Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5062,3 +5062,39 @@ return DAG.getSelect(dl, ResultType, Overflow, Result, SumDiff); } } + +SDValue +TargetLowering::getExpandedFixedPointMultiplication(SDNode *Node, + SelectionDAG &DAG) const { + assert(Node->getOpcode() == ISD::SMULFIX && "Expected opcode to be SMULFIX."); + assert(Node->getNumOperands() == 3 && + "Expected signed fixed point multiplication to have 3 operands."); + + SDLoc dl(Node); + SDValue LHS = Node->getOperand(0); + SDValue RHS = Node->getOperand(1); + assert(LHS.getValueType().isScalarInteger() && + "Expected operands to be integers. Vector of int arguments should " + "already be unrolled."); + assert(RHS.getValueType().isScalarInteger() && + "Expected operands to be integers. Vector of int arguments should " + "already be unrolled."); + assert(LHS.getValueType() == RHS.getValueType() && + "Expected both operands to be the same type"); + + unsigned Scale = cast(Node->getOperand(2))->getZExtValue(); + EVT VT = LHS.getValueType(); + assert(Scale < VT.getScalarSizeInBits() && + "Expected scale to be less than the number of bits."); + + SDValue Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + if (Scale) { + SDValue Hi = DAG.getNode(ISD::MULHS, dl, VT, LHS, RHS); + Lo = DAG.getNode(ISD::SRL, dl, VT, Lo, DAG.getConstant(Scale, dl, VT)); + Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, + DAG.getConstant(VT.getScalarSizeInBits() - Scale, dl, VT)); + return DAG.getNode(ISD::ADD, dl, VT, Lo, Hi); + } else { + return Lo; + } +} Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -614,6 +614,7 @@ setOperationAction(ISD::UADDSAT, VT, Expand); setOperationAction(ISD::SSUBSAT, VT, Expand); setOperationAction(ISD::USUBSAT, VT, Expand); + setOperationAction(ISD::SMULFIX, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); Index: llvm/lib/IR/Verifier.cpp =================================================================== --- llvm/lib/IR/Verifier.cpp +++ llvm/lib/IR/Verifier.cpp @@ -4517,6 +4517,20 @@ "of ints"); break; } + case Intrinsic::smul_fix: { + Value *Op1 = CS.getArgOperand(0); + Value *Op2 = CS.getArgOperand(1); + Value *Op3 = CS.getArgOperand(2); + Assert(Op1->getType()->isIntOrIntVectorTy(), + "first operand of smul_fix must be an int type or vector " + "of ints"); + Assert(Op2->getType()->isIntOrIntVectorTy(), + "second operand of smul_fix must be an int type or vector " + "of ints"); + Assert(Op3->getType()->isIntegerTy(), + "third argumenr of smul_fix must be an int"); + break; + } }; } Index: llvm/test/CodeGen/X86/smul_fix.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/smul_fix.ll @@ -0,0 +1,329 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s +; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32 + +declare i4 @llvm.smul.fix.i4 (i4, i4, i32) +declare i32 @llvm.smul.fix.i32 (i32, i32, i32) +declare i64 @llvm.smul.fix.i64 (i64, i64, i32) +declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) + +define i32 @func(i32 %x, i32 %y) { +; CHECK-LABEL: func: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: imull %esi, %edi +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: shldl $30, %edi, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: func: +; CHECK32: # %bb.0: +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl %eax, %ecx +; CHECK32-NEXT: imull %edx, %ecx +; CHECK32-NEXT: imull %edx +; CHECK32-NEXT: shrdl $2, %edx, %ecx +; CHECK32-NEXT: movl %ecx, %eax +; CHECK32-NEXT: retl + %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2); + ret i32 %tmp; +} + +define i64 @func2(i64 %x, i64 %y) { +; CHECK-LABEL: func2: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdi, %rcx +; CHECK-NEXT: imulq %rsi, %rcx +; CHECK-NEXT: imulq %rsi +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shldq $62, %rcx, %rax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: func2: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: .cfi_offset %esi, -8 +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: movl %ecx, %eax +; CHECK32-NEXT: mull %esi +; CHECK32-NEXT: shrl $2, %eax +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: addl %ecx, %edx +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: addl %esi, %edx +; CHECK32-NEXT: shll $30, %edx +; CHECK32-NEXT: popl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: retl + %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2); + ret i64 %tmp; +} + +define i4 @func3(i4 %x, i4 %y) { +; CHECK-LABEL: func3: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: mulb %sil +; CHECK-NEXT: shrb $2, %al +; CHECK-NEXT: movsbl %sil, %ecx +; CHECK-NEXT: movsbl %dil, %edx +; CHECK-NEXT: imull %ecx, %edx +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: shlb $6, %dl +; CHECK-NEXT: orb %dl, %al +; CHECK-NEXT: retq +; +; CHECK32-LABEL: func3: +; CHECK32: # %bb.0: +; CHECK32-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movsbl {{[0-9]+}}(%esp), %edx +; CHECK32-NEXT: movl %eax, %ecx +; CHECK32-NEXT: imull %edx, %ecx +; CHECK32-NEXT: shlb $6, %ch +; CHECK32-NEXT: # kill: def $al killed $al killed $eax +; CHECK32-NEXT: mulb %dl +; CHECK32-NEXT: shrb $2, %al +; CHECK32-NEXT: orb %ch, %al +; CHECK32-NEXT: retl + %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 2); + ret i4 %tmp; +} + +define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vec: +; CHECK: # %bb.0: +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm2, %ecx +; CHECK-NEXT: movslq %ecx, %rdx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: cltq +; CHECK-NEXT: imulq %rdx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: shldl $30, %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: movd %xmm3, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm3, %ecx +; CHECK-NEXT: movslq %ecx, %rdx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: cltq +; CHECK-NEXT: imulq %rdx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: shldl $30, %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm3 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: movslq %ecx, %rdx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: cltq +; CHECK-NEXT: imulq %rdx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: shldl $30, %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: movslq %ecx, %rdx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: cltq +; CHECK-NEXT: imulq %rdx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: shldl $30, %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; CHECK32-LABEL: vec: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl %ebp +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: pushl %ebx +; CHECK32-NEXT: .cfi_def_cfa_offset 12 +; CHECK32-NEXT: pushl %edi +; CHECK32-NEXT: .cfi_def_cfa_offset 16 +; CHECK32-NEXT: pushl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 20 +; CHECK32-NEXT: .cfi_offset %esi, -20 +; CHECK32-NEXT: .cfi_offset %edi, -16 +; CHECK32-NEXT: .cfi_offset %ebx, -12 +; CHECK32-NEXT: .cfi_offset %ebp, -8 +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl %eax, %ebp +; CHECK32-NEXT: imull %ecx, %ebp +; CHECK32-NEXT: imull %ecx +; CHECK32-NEXT: movl %edx, %ecx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: shldl $30, %ebp, %ecx +; CHECK32-NEXT: movl %eax, %ebp +; CHECK32-NEXT: imull %esi, %ebp +; CHECK32-NEXT: imull %esi +; CHECK32-NEXT: movl %edx, %esi +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: shldl $30, %ebp, %esi +; CHECK32-NEXT: movl %eax, %ebp +; CHECK32-NEXT: imull %edi, %ebp +; CHECK32-NEXT: imull %edi +; CHECK32-NEXT: movl %edx, %edi +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: shldl $30, %ebp, %edi +; CHECK32-NEXT: movl %eax, %ebp +; CHECK32-NEXT: imull %ebx, %ebp +; CHECK32-NEXT: imull %ebx +; CHECK32-NEXT: shldl $30, %ebp, %edx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl %edx, 12(%eax) +; CHECK32-NEXT: movl %edi, 8(%eax) +; CHECK32-NEXT: movl %esi, 4(%eax) +; CHECK32-NEXT: movl %ecx, (%eax) +; CHECK32-NEXT: popl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 16 +; CHECK32-NEXT: popl %edi +; CHECK32-NEXT: .cfi_def_cfa_offset 12 +; CHECK32-NEXT: popl %ebx +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: popl %ebp +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2); + ret <4 x i32> %tmp; +} + +; These result in regular integer multiplication +define i32 @func4(i32 %x, i32 %y) { +; CHECK-LABEL: func4: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: imull %esi, %eax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: func4: +; CHECK32: # %bb.0: +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: retl + %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 0); + ret i32 %tmp; +} + +define i64 @func5(i64 %x, i64 %y) { +; CHECK-LABEL: func5: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: imulq %rsi, %rax +; CHECK-NEXT: retq +; +; CHECK32-LABEL: func5: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: .cfi_offset %esi, -8 +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: movl %ecx, %eax +; CHECK32-NEXT: mull %esi +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: addl %ecx, %edx +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: addl %esi, %edx +; CHECK32-NEXT: popl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: retl + %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 0); + ret i64 %tmp; +} + +define i4 @func6(i4 %x, i4 %y) { +; CHECK-LABEL: func6: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: mulb %sil +; CHECK-NEXT: retq +; +; CHECK32-LABEL: func6: +; CHECK32: # %bb.0: +; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK32-NEXT: mulb {{[0-9]+}}(%esp) +; CHECK32-NEXT: retl + %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 0); + ret i4 %tmp; +} + +define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vec2: +; CHECK: # %bb.0: +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm2, %ecx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: movd %ecx, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: movd %xmm3, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm3, %ecx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: movd %ecx, %xmm3 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: movd %ecx, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: imull %eax, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; CHECK32-LABEL: vec2: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl %edi +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: pushl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 12 +; CHECK32-NEXT: .cfi_offset %esi, -12 +; CHECK32-NEXT: .cfi_offset %edi, -8 +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %edi +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %edx +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: movl %ecx, 12(%eax) +; CHECK32-NEXT: movl %edx, 8(%eax) +; CHECK32-NEXT: movl %esi, 4(%eax) +; CHECK32-NEXT: movl %edi, (%eax) +; CHECK32-NEXT: popl %esi +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: popl %edi +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0); + ret <4 x i32> %tmp; +}