Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -512,7 +512,8 @@ unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, OperandValueKind Opd2Info = OK_AnyValue, OperandValueProperties Opd1PropInfo = OP_None, - OperandValueProperties Opd2PropInfo = OP_None) const; + OperandValueProperties Opd2PropInfo = OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and @@ -752,7 +753,8 @@ getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) = 0; + OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) = 0; virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0; @@ -972,9 +974,10 @@ getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) override { + OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) override { return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); } int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) override { Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -305,7 +305,8 @@ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties 
Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { return 1; } @@ -422,6 +423,69 @@ VectorType *VecTy) const { return VF; } +protected: + // Obtain the minimum required size to hold the value (without the sign) + // In case of a vector it returns the min required size for one element. + unsigned minRequiredElementSize(const Value* Val, bool &isSigned) { + const Constant* VectorValue = dyn_cast<Constant>(Val); + if (isa<ConstantDataVector>(Val) || isa<ConstantVector>(Val)) { + // In case of a vector need to pick the max between the min + // required size for each element + VectorType *VT = dyn_cast<VectorType>(Val->getType()); + assert(VT && "Wrong Vector Type!"); + + // assume unsigned elements + isSigned = false; + + // the max required size is the total vector width divided by num + // of elements in the vector + unsigned MaxRequiredSize = VT->getBitWidth() / VT->getNumElements(); + + unsigned MinRequiredSize = 0; + for(unsigned i = 0; i < VT->getNumElements(); + ++i) { + ConstantInt* IntElement = + dyn_cast<ConstantInt>(VectorValue->getAggregateElement(i)); + + if (!IntElement) { + // not an int constant element + return MaxRequiredSize; + } + else { + bool signedElement = IntElement->getValue().isNegative(); + // Get the element min required size. + unsigned ElementMinRequiredSize = + IntElement->getValue().getMinSignedBits() - 1; + // In case one element is signed then all the vector is signed. + isSigned |= signedElement; + // Save the max required bit size between all the elements. 
+ if (MinRequiredSize < ElementMinRequiredSize) + MinRequiredSize = ElementMinRequiredSize; + } + } + return MinRequiredSize; + } + + if (const ConstantInt* CI = dyn_cast<ConstantInt>(Val)) { + isSigned = CI->getValue().isNegative(); + return CI->getValue().getMinSignedBits() - 1; + } + + const CastInst* Cast = dyn_cast<SExtInst>(Val); + if (Cast) { + isSigned = true; + return Cast->getSrcTy()->getScalarSizeInBits() - 1; + } + + Cast = dyn_cast<ZExtInst>(Val); + if (Cast) { + isSigned = false; + return Cast->getSrcTy()->getScalarSizeInBits(); + } + + isSigned = false; + return Val->getType()->getScalarSizeInBits(); + } }; /// \brief CRTP base class for use as a mix-in that aids implementing Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -307,7 +307,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()) { // Check if any of the operands are vector operands. 
const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); Index: include/llvm/IR/User.h =================================================================== --- include/llvm/IR/User.h +++ include/llvm/IR/User.h @@ -238,6 +238,25 @@ return make_range(value_op_begin(), value_op_end()); } + struct const_value_op_iterator + : iterator_adaptor_base { + explicit const_value_op_iterator(const Use *U = nullptr) : + iterator_adaptor_base(U) {} + const Value *operator*() const { return *I; } + const Value *operator->() const { return operator*(); } + }; + const_value_op_iterator value_op_begin() const { + return const_value_op_iterator(op_begin()); + } + const_value_op_iterator value_op_end() const { + return const_value_op_iterator(op_end()); + } + iterator_range operand_values() const { + return make_range(value_op_begin(), value_op_end()); + } + /// \brief Drop all references to operands. /// /// This function is in charge of "letting go" of all objects that this User Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -417,6 +417,14 @@ getOperandInfo(I->getOperand(0)); TargetTransformInfo::OperandValueKind Op2VK = getOperandInfo(I->getOperand(1)); + if (I->getOpcode() == Instruction::Mul) { + SmallVector Operands(I->value_op_begin(), + I->value_op_end()); + return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, + Op2VK, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, + Operands); + } return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, Op2VK); } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -277,9 +277,10 @@ int TargetTransformInfo::getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind 
Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { + OperandValueProperties Opd2PropInfo, + ArrayRef Args) const { int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -102,7 +102,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getAddressComputationCost(Type *Ty, bool IsComplex); Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -374,7 +374,7 @@ int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { // Legalize the type. 
std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -83,7 +83,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); unsigned getCFInstrCost(unsigned Opcode); Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -110,7 +110,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { Index: lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -113,7 +113,8 @@ TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- 
lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -430,7 +430,8 @@ int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: lib/Target/Lanai/LanaiTargetTransformInfo.h =================================================================== --- lib/Target/Lanai/LanaiTargetTransformInfo.h +++ lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -54,7 +54,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()) { int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.h =================================================================== --- lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -54,7 +54,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp =================================================================== --- lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -115,7 +115,7 @@ int NVPTXTTIImpl::getArithmeticInstrCost( 
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -71,7 +71,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -281,7 +281,7 @@ int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. 
Index: lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h =================================================================== --- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -61,7 +61,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); /// @} Index: lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp =================================================================== --- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { unsigned Cost = BasicTTIImplBase::getArithmeticInstrCost( Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -60,7 +60,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type 
*Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -111,15 +111,65 @@ } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. + { ISD::FMUL, MVT::f64, 2 }, // mulsd + { ISD::FMUL, MVT::v2f64, 4 }, // mulpd + { ISD::FMUL, MVT::v4f32, 2 }, // mulps + { ISD::FDIV, MVT::f32, 17 }, // divss + { ISD::FDIV, MVT::v4f32, 39 }, // divps + { ISD::FDIV, MVT::f64, 32 }, // divsd + { ISD::FDIV, MVT::v2f64, 69 }, // divpd + { ISD::FADD, MVT::v2f64, 2 }, // addpd + { ISD::FSUB, MVT::v2f64, 2 }, // subpd + }; + + if (ST->isSLM()) { + if (ISD == ISD::MUL && LT.second == MVT::v4i32 ) { + if (Args.size() == 2) { + // Check if the operands can be shrinked into a smaller datatype. + bool Op1Signed = false; + unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); + bool Op2Signed = false; + unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); + + bool signedMode = Op1Signed | Op2Signed; + unsigned OpMinSize = + (Op1MinSize > Op2MinSize ? 
Op1MinSize : Op2MinSize); + + if (OpMinSize <= 7) { + return LT.first * 3; // pmullw/sext + } + if (!signedMode && OpMinSize <= 8) { + return LT.first * 3; // pmullw/zext + } + if (OpMinSize <= 15) { + return LT.first * 5; // pmulhw/pmullw/pshuf + } + if (!signedMode && OpMinSize <= 16) { + return LT.first * 5; // pmulhw/pmullw/pshuf + } + } + } + if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, + LT.second)) { + return LT.first * Entry->Cost; + } + } + if (ISD == ISD::SDIV && Op2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -574,6 +624,16 @@ { ISD::MUL, MVT::v4i64, 8 }, { ISD::MUL, MVT::v8i64, 8 } }; + static const CostTblEntry CustomLoweredSLM[] = { + // A v2i64/v4i64 and multiply is custom lowered as a series of long + // multiplies(3), shifts(3) and adds(2). + // slm muldq version throughput is 2 + { ISD::MUL, MVT::v2i64, 11 }, + }; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(CustomLoweredSLM, ISD, + LT.second)) + return LT.first * Entry->Cost; if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) return LT.first * Entry->Cost; Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" @@ -6968,7 +6969,12 @@ } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - + if (I->getOpcode() == Instruction::Mul) { + SmallVector<const Value *, 4> Operands(I->value_op_begin(), + I->value_op_end()); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); + } return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, Op1VP, 
Op2VP); } Index: test/Analysis/CostModel/X86/slm-arith-costs.ll =================================================================== --- test/Analysis/CostModel/X86/slm-arith-costs.ll +++ test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -0,0 +1,291 @@ +; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; 8bit mul +define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i8 + %res = mul nsw i8 %a, %b + ret i8 %res +} + +define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i8> + %res = mul nsw <2 x i8> %a, %b + ret <2 x i8> %res +} + +define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i8> + %res = mul nsw <4 x i8> %a, %b + ret <4 x i8> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <8 x i8> @slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i8> + %res = mul nsw <8 x i8> %a, %b + ret <8 x i8> %res +} + +define <16 x i8> 
@slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) { +entry: +; SLM: cost of 14 {{.*}} mul nsw <16 x i8> + %res = mul nsw <16 x i8> %a, %b + ret <16 x i8> %res +} + +; 16bit mul +define i16 @slm-costs_16_scalar_mul(i16 %a, i16 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i16 + %res = mul nsw i16 %a, %b + ret i16 %res +} + +define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i16> + %res = mul nsw <2 x i16> %a, %b + ret <2 x i16> %res +} + +define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i16> + %res = mul nsw <4 x i16> %a, %b + ret <4 x i16> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <8 x i16> @slm-costs_16_v8_mul(<8 x i16> %a, <8 x i16> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i16> + %res = mul nsw <8 x i16> %a, %b + ret <8 x i16> %res +} + +define <16 x i16> @slm-costs_16_v16_mul(<16 x i16> %a, <16 x i16> %b) { +entry: +; SLM: cost of 4 {{.*}} mul nsw <16 x i16> + %res = mul nsw <16 x i16> %a, %b + ret <16 x i16> %res +} + +; 32bit mul +define i32 @slm-costs_32_scalar_mul(i32 %a, i32 %b) { +entry: +; SLM: 
cost of 1 {{.*}} mul nsw i32 + %res = mul nsw i32 %a, %b + ret i32 %res +} + +define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i32> + %res = mul nsw <2 x i32> %a, %b + ret <2 x i32> %res +} + +define <4 x i32> @slm-costs_32_v4_mul(<4 x i32> %a, <4 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %res = mul nsw <4 x i32> %a, %b + ret <4 x i32> %res +} + +define <8 x i32> @slm-costs_32_v8_mul(<8 x i32> %a, <8 x i32> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <8 x i32> + %res = mul nsw <8 x i32> %a, %b + ret <8 x i32> %res +} + +define <16 x i32> @slm-costs_32_v16_mul(<16 x i32> %a, <16 x i32> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <16 x i32> + %res = mul nsw <16 x i32> %a, %b + ret <16 x i32> %res +} + +; 64bit mul +define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i64 + %res = mul nsw i64 %a, %b + ret i64 %res +} + +define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i64> + %res = mul nsw <2 x i64> %a, %b + ret <2 x i64> %res +} + +define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <4 x i64> + %res = mul nsw <4 x i64> %a, %b + ret <4 x i64> %res +} + +define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <8 x i64> + %res = mul nsw <8 x i64> %a, %b + ret <8 x i64> %res +} + +define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) { +entry: +; SLM: cost of 88 {{.*}} mul nsw <16 x i64> + %res = mul nsw <16 x i64> %a, %b + ret <16 x i64> %res +} + +; mulsd +define double @slm-costs_mulsd(double %a, double %b) { +entry: +; SLM: cost of 2 {{.*}} fmul double + %res = fmul double %a, %b + ret double %res +} + +; mulpd +define <2 x double> @slm-costs_mulpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 4 {{.*}} fmul <2 x double> + 
%res = fmul <2 x double> %a, %b + ret <2 x double> %res +} + +; mulps +define <4 x float> @slm-costs_mulps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 2 {{.*}} fmul <4 x float> + %res = fmul <4 x float> %a, %b + ret <4 x float> %res +} + +; divss +define float @slm-costs_divss(float %a, float %b) { +entry: +; SLM: cost of 17 {{.*}} fdiv float + %res = fdiv float %a, %b + ret float %res +} + +; divps +define <4 x float> @slm-costs_divps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 39 {{.*}} fdiv <4 x float> + %res = fdiv <4 x float> %a, %b + ret <4 x float> %res +} + +; divsd +define double @slm-costs_divsd(double %a, double %b) { +entry: +; SLM: cost of 32 {{.*}} fdiv double + %res = fdiv double %a, %b + ret double %res +} + +; divpd +define <2 x double> @slm-costs_divpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 69 {{.*}} fdiv <2 x double> + %res = fdiv <2 x double> %a, %b + ret <2 x double> %res +} + +; addpd +define <2 x double> @slm-costs_addpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fadd <2 x double> + %res = fadd <2 x double> %a, %b + ret <2 x double> %res +} + +; subpd +define <2 x double> @slm-costs_subpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fsub <2 x double> + %res = fsub <2 x double> %a, %b + ret <2 x double> %res +} + +!llvm.ident = !{!0} + +!0 = !{!"clang version 4.0.0 (cfe/trunk 287996)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} Index: test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll =================================================================== --- test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll +++ test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll @@ -0,0 +1,144 @@ +; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i8 @mul_i8(i8* 
%dataA, i8* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i8 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i8 %acc.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = sext i8 %1 to i32 +; sources of the mul is sext\sext from i8 +; use pmullw\sext seq. 
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul is zext\sext from i8 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i8 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul is zext\zext from i8 +; use pmullw\zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i8 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul is sext\-120 +; use pmullw\sext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -120, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul is sext\250 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 250, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul is zext\-120 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -120, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul is zext\250 +; use pmullw\zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i16 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i16 %acc.0.lcssa + +for.body: ; preds = 
%for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 1 + %conv = sext i16 %0 to i32 + %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv + %1 = load i16, i16* %arrayidx2, align 1 + %conv3 = sext i16 %1 to i32 +; sources of the mul is sext\sext from i16 +; use pmulhw\pmullw\pshuf seq. +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul is zext\sext from i16 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i16 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul is zext\zext from i16 +; use pmulhw\pmullw\zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i16 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul is sext\-32000 +; use pmulhw\pmullw\sext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -32000, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul is sext\64000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 64000, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul is zext\-32000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -32000, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul is zext\64000 +; use pmulhw\pmullw\zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +