Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -518,11 +518,15 @@ unsigned getMaxInterleaveFactor(unsigned VF) const; /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. + /// \p Args is an optional argument which holds the instruction operands + /// values so the TTI can analyze those values searching for special + /// cases/optimizations based on those values. int getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, OperandValueKind Opd2Info = OK_AnyValue, OperandValueProperties Opd1PropInfo = OP_None, - OperandValueProperties Opd2PropInfo = OP_None) const; + OperandValueProperties Opd2PropInfo = OP_None, + ArrayRef Args = ArrayRef()) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and @@ -763,7 +767,8 @@ getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) = 0; + OperandValueProperties Opd2PropInfo, + ArrayRef Args) = 0; virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0; @@ -984,9 +989,10 @@ getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) override { + OperandValueProperties Opd2PropInfo, + ArrayRef Args) override { return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); } int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) override { 
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -306,7 +306,8 @@ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { return 1; } @@ -427,6 +428,63 @@ return VF; } protected: + // Obtain the minimum required size to hold the value (without the sign) + // In case of a vector it returns the min required size for one element. + unsigned minRequiredElementSize(const Value* Val, bool &isSigned) { + if (isa(Val) || isa(Val)) { + const auto* VectorValue = cast(Val); + + // In case of a vector need to pick the max between the min + // required size for each element + auto *VT = cast(Val->getType()); + + // Assume unsigned elements + isSigned = false; + + // The max required size is the total vector width divided by num + // of elements in the vector + unsigned MaxRequiredSize = VT->getBitWidth() / VT->getNumElements(); + + unsigned MinRequiredSize = 0; + for(unsigned i = 0, e = VT->getNumElements(); i < e; ++i) { + if (auto* IntElement = + dyn_cast(VectorValue->getAggregateElement(i))) { + bool signedElement = IntElement->getValue().isNegative(); + // Get the element min required size. + unsigned ElementMinRequiredSize = + IntElement->getValue().getMinSignedBits() - 1; + // In case one element is signed then all the vector is signed. + isSigned |= signedElement; + // Save the max required bit size between all the elements. 
+ MinRequiredSize = std::max(MinRequiredSize, ElementMinRequiredSize); + } + else { + // not an int constant element + return MaxRequiredSize; + } + } + return MinRequiredSize; + } + + if (const auto* CI = dyn_cast(Val)) { + isSigned = CI->getValue().isNegative(); + return CI->getValue().getMinSignedBits() - 1; + } + + if (const auto* Cast = dyn_cast(Val)) { + isSigned = true; + return Cast->getSrcTy()->getScalarSizeInBits() - 1; + } + + if (const auto* Cast = dyn_cast(Val)) { + isSigned = false; + return Cast->getSrcTy()->getScalarSizeInBits(); + } + + isSigned = false; + return Val->getType()->getScalarSizeInBits(); + } + bool isStridedAccess(const SCEV *Ptr) { return Ptr && isa(Ptr); } Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h @@ -308,7 +308,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()) { // Check if any of the operands are vector operands. 
const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); Index: llvm/trunk/lib/Analysis/CostModel.cpp =================================================================== --- llvm/trunk/lib/Analysis/CostModel.cpp +++ llvm/trunk/lib/Analysis/CostModel.cpp @@ -438,8 +438,11 @@ getOperandInfo(I->getOperand(0)); TargetTransformInfo::OperandValueKind Op2VK = getOperandInfo(I->getOperand(1)); + SmallVector Operands(I->operand_values()); return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, - Op2VK); + Op2VK, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, + Operands); } case Instruction::Select: { const SelectInst *SI = cast(I); Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp +++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp @@ -277,9 +277,10 @@ int TargetTransformInfo::getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { + OperandValueProperties Opd2PropInfo, + ArrayRef Args) const { int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -102,7 +102,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, 
+ ArrayRef Args = ArrayRef()); int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); Index: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -374,7 +374,7 @@ int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -83,7 +83,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); unsigned getCFInstrCost(unsigned Opcode); Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -110,7 +110,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); if 
(!OrigTy.isSimple()) { Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h @@ -114,7 +114,8 @@ TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -433,7 +433,8 @@ int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: llvm/trunk/lib/Target/Lanai/LanaiTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/Lanai/LanaiTargetTransformInfo.h +++ llvm/trunk/lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -54,7 +54,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()) { int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { Index: 
llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -54,7 +54,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; Index: llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -115,7 +115,7 @@ int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { // Legalize the type. 
std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -71,7 +71,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -281,7 +281,7 @@ int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. 
Index: llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -61,7 +61,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); /// @} Index: llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { unsigned Cost = BasicTTIImplBase::getArithmeticInstrCost( Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h @@ -60,7 +60,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int 
getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp @@ -114,15 +114,62 @@ } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. + { ISD::FMUL, MVT::f64, 2 }, // mulsd + { ISD::FMUL, MVT::v2f64, 4 }, // mulpd + { ISD::FMUL, MVT::v4f32, 2 }, // mulps + { ISD::FDIV, MVT::f32, 17 }, // divss + { ISD::FDIV, MVT::v4f32, 39 }, // divps + { ISD::FDIV, MVT::f64, 32 }, // divsd + { ISD::FDIV, MVT::v2f64, 69 }, // divpd + { ISD::FADD, MVT::v2f64, 2 }, // addpd + { ISD::FSUB, MVT::v2f64, 2 }, // subpd + // v2i64/v4i64 mul is custom lowered as a series of long + // multiplies(3), shifts(3) and adds(2). + // slm muldq version throughput is 2 + { ISD::MUL, MVT::v2i64, 11 }, + }; + + if (ST->isSLM()) { + if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { + // Check if the operands can be shrunk into a smaller datatype. 
+ bool Op1Signed = false; + unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); + bool Op2Signed = false; + unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); + + bool signedMode = Op1Signed | Op2Signed; + unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); + + if (OpMinSize <= 7) + return LT.first * 3; // pmullw/sext + if (!signedMode && OpMinSize <= 8) + return LT.first * 3; // pmullw/zext + if (OpMinSize <= 15) + return LT.first * 5; // pmullw/pmulhw/pshuf + if (!signedMode && OpMinSize <= 16) + return LT.first * 5; // pmullw/pmulhw/pshuf + } + if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, + LT.second)) { + return LT.first * Entry->Cost; + } + } + if (ISD == ISD::SDIV && Op2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" @@ -6949,9 +6950,9 @@ } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, - Op1VP, Op2VP); + SmallVector Operands(I->operand_values()); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast(I); Index: llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll +++ llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -0,0 +1,317 
@@ +; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; 8bit mul +define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i8 + %res = mul nsw i8 %a, %b + ret i8 %res +} + +define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i8> + %res = mul nsw <2 x i8> %a, %b + ret <2 x i8> %res +} + +define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i8> + %res = mul nsw <4 x i8> %a, %b + ret <4 x i8> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul_fail_2(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul_fail_2(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <8 x i8> 
@slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i8> + %res = mul nsw <8 x i8> %a, %b + ret <8 x i8> %res +} + +define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) { +entry: +; SLM: cost of 14 {{.*}} mul nsw <16 x i8> + %res = mul nsw <16 x i8> %a, %b + ret <16 x i8> %res +} + +; 16bit mul +define i16 @slm-costs_16_scalar_mul(i16 %a, i16 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i16 + %res = mul nsw i16 %a, %b + ret i16 %res +} + +define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i16> + %res = mul nsw <2 x i16> %a, %b + ret <2 x i16> %res +} + +define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i16> + %res = mul nsw <4 x i16> %a, %b + ret <4 x i16> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul_fail_2(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul_fail_2(<4 x i16> 
%a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <8 x i16> @slm-costs_16_v8_mul(<8 x i16> %a, <8 x i16> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i16> + %res = mul nsw <8 x i16> %a, %b + ret <8 x i16> %res +} + +define <16 x i16> @slm-costs_16_v16_mul(<16 x i16> %a, <16 x i16> %b) { +entry: +; SLM: cost of 4 {{.*}} mul nsw <16 x i16> + %res = mul nsw <16 x i16> %a, %b + ret <16 x i16> %res +} + +; 32bit mul +define i32 @slm-costs_32_scalar_mul(i32 %a, i32 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i32 + %res = mul nsw i32 %a, %b + ret i32 %res +} + +define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i32> + %res = mul nsw <2 x i32> %a, %b + ret <2 x i32> %res +} + +define <4 x i32> @slm-costs_32_v4_mul(<4 x i32> %a, <4 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %res = mul nsw <4 x i32> %a, %b + ret <4 x i32> %res +} + +define <8 x i32> @slm-costs_32_v8_mul(<8 x i32> %a, <8 x i32> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <8 x i32> + %res = mul nsw <8 x i32> %a, %b + ret <8 x i32> %res +} + +define <16 x i32> @slm-costs_32_v16_mul(<16 x i32> %a, <16 x i32> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <16 x i32> + %res = mul nsw <16 x i32> %a, %b + ret <16 x i32> %res +} + +; 64bit mul +define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i64 + %res = mul nsw i64 %a, %b + ret i64 %res +} + +define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i64> + %res = mul nsw <2 x i64> %a, %b + ret <2 x i64> %res +} + +define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <4 x i64> + %res = mul nsw <4 x i64> %a, %b + ret <4 x i64> %res +} + +define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x 
i64> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <8 x i64> + %res = mul nsw <8 x i64> %a, %b + ret <8 x i64> %res +} + +define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) { +entry: +; SLM: cost of 88 {{.*}} mul nsw <16 x i64> + %res = mul nsw <16 x i64> %a, %b + ret <16 x i64> %res +} + +; mulsd +define double @slm-costs_mulsd(double %a, double %b) { +entry: +; SLM: cost of 2 {{.*}} fmul double + %res = fmul double %a, %b + ret double %res +} + +; mulpd +define <2 x double> @slm-costs_mulpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 4 {{.*}} fmul <2 x double> + %res = fmul <2 x double> %a, %b + ret <2 x double> %res +} + +; mulps +define <4 x float> @slm-costs_mulps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 2 {{.*}} fmul <4 x float> + %res = fmul <4 x float> %a, %b + ret <4 x float> %res +} + +; divss +define float @slm-costs_divss(float %a, float %b) { +entry: +; SLM: cost of 17 {{.*}} fdiv float + %res = fdiv float %a, %b + ret float %res +} + +; divps +define <4 x float> @slm-costs_divps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 39 {{.*}} fdiv <4 x float> + %res = fdiv <4 x float> %a, %b + ret <4 x float> %res +} + +; divsd +define double @slm-costs_divsd(double %a, double %b) { +entry: +; SLM: cost of 32 {{.*}} fdiv double + %res = fdiv double %a, %b + ret double %res +} + +; divpd +define <2 x double> @slm-costs_divpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 69 {{.*}} fdiv <2 x double> + %res = fdiv <2 x double> %a, %b + ret <2 x double> %res +} + +; addpd +define <2 x double> @slm-costs_addpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fadd <2 x double> + %res = fadd <2 x double> %a, %b + ret <2 x double> %res +} + +; subpd +define <2 x double> @slm-costs_subpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fsub <2 x double> + %res = fsub <2 x double> %a, %b + ret <2 x double> %res +} + Index: 
llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll @@ -0,0 +1,144 @@ +; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i8 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i8 %acc.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = sext i8 %1 to i32 +; sources of the mul is sext\sext from i8 +; use pmullw\sext seq. 
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul is zext\sext from i8 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i8 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul is zext\zext from i8 +; use pmullw\zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i8 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul is sext\-120 +; use pmullw\sext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -120, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul is sext\250 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 250, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul is zext\-120 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -120, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul is zext\250 +; use pmullw\zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i16 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i16 %acc.0.lcssa + +for.body: ; preds = 
%for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 1 + %conv = sext i16 %0 to i32 + %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv + %1 = load i16, i16* %arrayidx2, align 1 + %conv3 = sext i16 %1 to i32 +; sources of the mul is sext\sext from i16 +; use pmulhw\pmullw\pshuf seq. +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul is zext\sext from i16 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i16 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul is zext\zext from i16 +; use pmulhw\pmullw\zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i16 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul is sext\-32000 +; use pmulhw\pmullw\sext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -32000, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul is sext\64000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 64000, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul is zext\-32000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -32000, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul is zext\64000 +; use pmulhw\pmullw\zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +