Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -518,11 +518,15 @@ unsigned getMaxInterleaveFactor(unsigned VF) const; /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. + /// \p Args is an optional argument which holds the instruction operands + /// values so the TTI can analyze those values searching for special + /// cases/optimizations based on those values. int getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, OperandValueKind Opd2Info = OK_AnyValue, OperandValueProperties Opd1PropInfo = OP_None, - OperandValueProperties Opd2PropInfo = OP_None) const; + OperandValueProperties Opd2PropInfo = OP_None, + ArrayRef Args = ArrayRef()) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and @@ -763,7 +767,8 @@ getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) = 0; + OperandValueProperties Opd2PropInfo, + ArrayRef Args) = 0; virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0; @@ -984,9 +989,10 @@ getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) override { + OperandValueProperties Opd2PropInfo, + ArrayRef Args) override { return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); } int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) override { 
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -306,7 +306,8 @@ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { return 1; } @@ -427,6 +428,63 @@ return VF; } protected: + // Obtain the minimum required size to hold the value (without the sign) + // In case of a vector it returns the min required size for one element. + unsigned minRequiredElementSize(const Value* Val, bool &isSigned) { + if (isa(Val) || isa(Val)) { + const auto* VectorValue = cast(Val); + + // In case of a vector need to pick the max between the min + // required size for each element + auto *VT = cast(Val->getType()); + + // Assume unsigned elements + isSigned = false; + + // The max required size is the total vector width divided by num + // of elements in the vector + unsigned MaxRequiredSize = VT->getBitWidth() / VT->getNumElements(); + + unsigned MinRequiredSize = 0; + for(unsigned i = 0, e = VT->getNumElements(); i < e; ++i) { + if (auto* IntElement = + dyn_cast(VectorValue->getAggregateElement(i))) { + bool signedElement = IntElement->getValue().isNegative(); + // Get the element min required size. + unsigned ElementMinRequiredSize = + IntElement->getValue().getMinSignedBits() - 1; + // In case one element is signed then all the vector is signed. + isSigned |= signedElement; + // Save the max required bit size between all the elements. 
+ MinRequiredSize = std::max(MinRequiredSize, ElementMinRequiredSize); + } + else { + // not an int constant element + return MaxRequiredSize; + } + } + return MinRequiredSize; + } + + if (const auto* CI = dyn_cast(Val)) { + isSigned = CI->getValue().isNegative(); + return CI->getValue().getMinSignedBits() - 1; + } + + if (const auto* Cast = dyn_cast(Val)) { + isSigned = true; + return Cast->getSrcTy()->getScalarSizeInBits() - 1; + } + + if (const auto* Cast = dyn_cast(Val)) { + isSigned = false; + return Cast->getSrcTy()->getScalarSizeInBits(); + } + + isSigned = false; + return Val->getType()->getScalarSizeInBits(); + } + bool isStridedAccess(const SCEV *Ptr) { return Ptr && isa(Ptr); } Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h @@ -308,7 +308,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()) { // Check if any of the operands are vector operands. 
const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); Index: llvm/trunk/lib/Analysis/CostModel.cpp =================================================================== --- llvm/trunk/lib/Analysis/CostModel.cpp +++ llvm/trunk/lib/Analysis/CostModel.cpp @@ -438,8 +438,11 @@ getOperandInfo(I->getOperand(0)); TargetTransformInfo::OperandValueKind Op2VK = getOperandInfo(I->getOperand(1)); + SmallVector Operands(I->operand_values()); return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, - Op2VK); + Op2VK, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, + Operands); } case Instruction::Select: { const SelectInst *SI = cast(I); Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp +++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp @@ -277,9 +277,10 @@ int TargetTransformInfo::getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { + OperandValueProperties Opd2PropInfo, + ArrayRef Args) const { int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -102,7 +102,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, 
+ ArrayRef Args = ArrayRef()); int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); Index: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -374,7 +374,7 @@ int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -83,7 +83,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); unsigned getCFInstrCost(unsigned Opcode); Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -110,7 +110,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); if 
(!OrigTy.isSimple()) { Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h @@ -114,7 +114,8 @@ TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -433,7 +433,8 @@ int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: llvm/trunk/lib/Target/Lanai/LanaiTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/Lanai/LanaiTargetTransformInfo.h +++ llvm/trunk/lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -54,7 +54,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()) { int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { Index: 
llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -54,7 +54,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; Index: llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -115,7 +115,7 @@ int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { // Legalize the type. 
std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -71,7 +71,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -281,7 +281,7 @@ int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. 
Index: llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -61,7 +61,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); /// @} Index: llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { unsigned Cost = BasicTTIImplBase::getArithmeticInstrCost( Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h @@ -60,7 +60,8 @@ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); int 
getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp @@ -114,15 +114,62 @@ } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. + { ISD::FMUL, MVT::f64, 2 }, // mulsd + { ISD::FMUL, MVT::v2f64, 4 }, // mulpd + { ISD::FMUL, MVT::v4f32, 2 }, // mulps + { ISD::FDIV, MVT::f32, 17 }, // divss + { ISD::FDIV, MVT::v4f32, 39 }, // divps + { ISD::FDIV, MVT::f64, 32 }, // divsd + { ISD::FDIV, MVT::v2f64, 69 }, // divpd + { ISD::FADD, MVT::v2f64, 2 }, // addpd + { ISD::FSUB, MVT::v2f64, 2 }, // subpd + // v2i64/v4i64 mul is custom lowered as a series of long + // multiplies(3), shifts(3) and adds(2). + // slm muldq version throughput is 2 + { ISD::MUL, MVT::v2i64, 11 }, + }; + + if (ST->isSLM()) { + if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { + // Check if the operands can be shrunk into a smaller datatype. 
+ bool Op1Signed = false; + unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); + bool Op2Signed = false; + unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); + + bool signedMode = Op1Signed | Op2Signed; + unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); + + if (OpMinSize <= 7) + return LT.first * 3; // pmullw/sext + if (!signedMode && OpMinSize <= 8) + return LT.first * 3; // pmullw/zext + if (OpMinSize <= 15) + return LT.first * 5; // pmullw/pmulhw/pshuf + if (!signedMode && OpMinSize <= 16) + return LT.first * 5; // pmullw/pmulhw/pshuf + } + if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, + LT.second)) { + return LT.first * Entry->Cost; + } + } + if (ISD == ISD::SDIV && Op2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" @@ -6949,9 +6950,9 @@ } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, - Op1VP, Op2VP); + SmallVector Operands(I->operand_values()); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast(I); Index: llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll +++ llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -0,0 +1,317 
@@ +; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; 8bit mul +define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i8 + %res = mul nsw i8 %a, %b + ret i8 %res +} + +define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i8> + %res = mul nsw <2 x i8> %a, %b + ret <2 x i8> %res +} + +define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i8> + %res = mul nsw <4 x i8> %a, %b + ret <4 x i8> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul_fail_2(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul_fail_2(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <8 x i8> 
@slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i8> + %res = mul nsw <8 x i8> %a, %b + ret <8 x i8> %res +} + +define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) { +entry: +; SLM: cost of 14 {{.*}} mul nsw <16 x i8> + %res = mul nsw <16 x i8> %a, %b + ret <16 x i8> %res +} + +; 16bit mul +define i16 @slm-costs_16_scalar_mul(i16 %a, i16 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i16 + %res = mul nsw i16 %a, %b + ret i16 %res +} + +define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i16> + %res = mul nsw <2 x i16> %a, %b + ret <2 x i16> %res +} + +define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i16> + %res = mul nsw <4 x i16> %a, %b + ret <4 x i16> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul_fail_2(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul_fail_2(<4 x i16> 
%a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, + ret <4 x i32> %res +} + +define <8 x i16> @slm-costs_16_v8_mul(<8 x i16> %a, <8 x i16> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i16> + %res = mul nsw <8 x i16> %a, %b + ret <8 x i16> %res +} + +define <16 x i16> @slm-costs_16_v16_mul(<16 x i16> %a, <16 x i16> %b) { +entry: +; SLM: cost of 4 {{.*}} mul nsw <16 x i16> + %res = mul nsw <16 x i16> %a, %b + ret <16 x i16> %res +} + +; 32bit mul +define i32 @slm-costs_32_scalar_mul(i32 %a, i32 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i32 + %res = mul nsw i32 %a, %b + ret i32 %res +} + +define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i32> + %res = mul nsw <2 x i32> %a, %b + ret <2 x i32> %res +} + +define <4 x i32> @slm-costs_32_v4_mul(<4 x i32> %a, <4 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %res = mul nsw <4 x i32> %a, %b + ret <4 x i32> %res +} + +define <8 x i32> @slm-costs_32_v8_mul(<8 x i32> %a, <8 x i32> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <8 x i32> + %res = mul nsw <8 x i32> %a, %b + ret <8 x i32> %res +} + +define <16 x i32> @slm-costs_32_v16_mul(<16 x i32> %a, <16 x i32> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <16 x i32> + %res = mul nsw <16 x i32> %a, %b + ret <16 x i32> %res +} + +; 64bit mul +define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i64 + %res = mul nsw i64 %a, %b + ret i64 %res +} + +define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i64> + %res = mul nsw <2 x i64> %a, %b + ret <2 x i64> %res +} + +define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <4 x i64> + %res = mul nsw <4 x i64> %a, %b + ret <4 x i64> %res +} + +define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x 
i64> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <8 x i64> + %res = mul nsw <8 x i64> %a, %b + ret <8 x i64> %res +} + +define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) { +entry: +; SLM: cost of 88 {{.*}} mul nsw <16 x i64> + %res = mul nsw <16 x i64> %a, %b + ret <16 x i64> %res +} + +; mulsd +define double @slm-costs_mulsd(double %a, double %b) { +entry: +; SLM: cost of 2 {{.*}} fmul double + %res = fmul double %a, %b + ret double %res +} + +; mulpd +define <2 x double> @slm-costs_mulpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 4 {{.*}} fmul <2 x double> + %res = fmul <2 x double> %a, %b + ret <2 x double> %res +} + +; mulps +define <4 x float> @slm-costs_mulps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 2 {{.*}} fmul <4 x float> + %res = fmul <4 x float> %a, %b + ret <4 x float> %res +} + +; divss +define float @slm-costs_divss(float %a, float %b) { +entry: +; SLM: cost of 17 {{.*}} fdiv float + %res = fdiv float %a, %b + ret float %res +} + +; divps +define <4 x float> @slm-costs_divps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 39 {{.*}} fdiv <4 x float> + %res = fdiv <4 x float> %a, %b + ret <4 x float> %res +} + +; divsd +define double @slm-costs_divsd(double %a, double %b) { +entry: +; SLM: cost of 32 {{.*}} fdiv double + %res = fdiv double %a, %b + ret double %res +} + +; divpd +define <2 x double> @slm-costs_divpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 69 {{.*}} fdiv <2 x double> + %res = fdiv <2 x double> %a, %b + ret <2 x double> %res +} + +; addpd +define <2 x double> @slm-costs_addpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fadd <2 x double> + %res = fadd <2 x double> %a, %b + ret <2 x double> %res +} + +; subpd +define <2 x double> @slm-costs_subpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fsub <2 x double> + %res = fsub <2 x double> %a, %b + ret <2 x double> %res +} + Index: 
llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll @@ -0,0 +1,144 @@ +; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i8 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i8 %acc.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = sext i8 %1 to i32 +; sources of the mul is sext\sext from i8 +; use pmullw\sext seq. 
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul is zext\sext from i8 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i8 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul is zext\zext from i8 +; use pmullw\zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i8 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul is sext\-120 +; use pmullw\sext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -120, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul is sext\250 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 250, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul is zext\-120 +; use pmulhw\pmullw\pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -120, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul is zext\250 +; use pmullw\zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i16 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i16 %acc.0.lcssa + +for.body: ; preds = 
%for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 1 + %conv = sext i16 %0 to i32 + %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv + %1 = load i16, i16* %arrayidx2, align 1 + %conv3 = sext i16 %1 to i32 +; sources of the mul is sext\sext from i16 +; use pmulhw\pmullw\pshuf seq. +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul is zext\sext from i16 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i16 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul is zext\zext from i16 +; use pmulhw\pmullw\zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i16 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul is sext\-32000 +; use pmulhw\pmullw\sext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -32000, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul is sext\64000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 64000, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul is zext\-32000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -32000, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul is zext\64000 +; use pmulhw\pmullw\zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +