Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -28,6 +28,7 @@
 class TargetTransformInfo;
 class Type;
 class Value;
+class VectorVariant;

 namespace Intrinsic {
 enum ID : unsigned;
@@ -176,6 +177,10 @@
 /// elements, it will be padded with undefs.
 Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs);

+/// \brief Determine the characteristic type of the vector function as
+/// specified by the vector function ABI.
+Type *calcCharacteristicType(Function &F, VectorVariant &Variant);
+
 } // llvm namespace

 #endif
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1730,6 +1730,10 @@
           TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
         continue;

+      if (Call && Call->getCalledFunction() &&
+          Call->getCalledFunction()->hasFnAttribute("vector-variants"))
+        continue;
+
       auto *Ld = dyn_cast<LoadInst>(&I);
       if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
         recordAnalysis("NonSimpleLoad", Ld)
Index: lib/Analysis/VectorUtils.cpp
===================================================================
--- lib/Analysis/VectorUtils.cpp
+++ lib/Analysis/VectorUtils.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorVariant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
@@ -574,3 +575,47 @@

   return ResList[0];
 }
+
+Type *llvm::calcCharacteristicType(Function &F, VectorVariant &Variant) {
+  Type *ReturnType = F.getReturnType();
+  Type *CharacteristicDataType = nullptr;
+
+  if (!ReturnType->isVoidTy())
+    CharacteristicDataType = ReturnType;
+
+  if (!CharacteristicDataType) {
+    std::vector<VectorKind> &ParmKinds = Variant.getParameters();
+    Function::const_arg_iterator ArgIt = F.arg_begin();
+    Function::const_arg_iterator ArgEnd = F.arg_end();
+    std::vector<VectorKind>::iterator VKIt = ParmKinds.begin();
+
+    for (; ArgIt != ArgEnd; ++ArgIt, ++VKIt) {
+      if (VKIt->isVector()) {
+        CharacteristicDataType = ArgIt->getType();
+        break;
+      }
+    }
+  }
+
+  // TODO: also exclude Clang's ComplexType.
+  if (!CharacteristicDataType || CharacteristicDataType->isStructTy())
+    CharacteristicDataType = Type::getInt32Ty(F.getContext());
+
+  // Legalize the characteristic type based on target requirements.
+  CharacteristicDataType =
+      Variant.promoteToSupportedType(CharacteristicDataType);
+
+  if (CharacteristicDataType->isPointerTy()) {
+    // For cases such as 'int* foo(int x)', where x is a non-vector type, the
+    // characteristic type at this point will be i32*. If we used the
+    // DataLayout to query the supported pointer size, a promotion to i64*
+    // would be incorrect because the mask element type would then mismatch
+    // the element type of the characteristic type. Use the pointee type
+    // instead.
+    PointerType *PointerTy = cast<PointerType>(CharacteristicDataType);
+    CharacteristicDataType = PointerTy->getElementType();
+  }
+
+  return CharacteristicDataType;
+}
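Reviewer note (not part of the patch): the selection order in calcCharacteristicType is the return type first, then the first 'v' (vector) parameter, then an i32 fallback, followed by target legalization and pointer-element peeling. Below is a minimal standalone C++ sketch of just that selection ladder, with a hypothetical ParamKind enum and string type names standing in for VectorVariant's parameter kinds and llvm::Type.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

enum class ParamKind { Vector, Linear, Uniform };

// Mirrors the order in calcCharacteristicType: the return type wins, then
// the first vector parameter, then an i32 fallback.
std::string pickCharacteristicType(
    const std::string &RetTy,
    const std::vector<std::pair<std::string, ParamKind>> &Params) {
  if (RetTy != "void")
    return RetTy;
  for (const auto &P : Params)
    if (P.second == ParamKind::Vector)
      return P.first;
  return "i32"; // Fallback, as in the patch.
}

int main() {
  // 'i32 dowork(i32 v, i32 l, i32 u)' -> i32 via the return type.
  std::cout << pickCharacteristicType("i32", {{"i32", ParamKind::Vector},
                                              {"i32", ParamKind::Linear},
                                              {"i32", ParamKind::Uniform}})
            << "\n";
  // 'void bar(float v)' -> float via the first vector parameter.
  std::cout << pickCharacteristicType("void", {{"float", ParamKind::Vector}})
            << "\n";
  // 'void baz(i32* l)' -> i32 fallback (no vector parameter).
  std::cout << pickCharacteristicType("void", {{"i32*", ParamKind::Linear}})
            << "\n";
}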
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -83,6 +83,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/VectorVariant.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -3857,7 +3858,8 @@
   // If we can't emit a vector call for this function, then the currently found
   // cost is the cost we need to return.
   NeedToScalarize = true;
-  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+  if ((!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) &&
+      !CI->getCalledFunction()->hasFnAttribute("vector-variants"))
     return Cost;

   // If the corresponding vector cost is cheaper, return its cost.
@@ -4692,6 +4694,58 @@
   return !CInt || CInt->isZero();
 }

+static VectorVariant *matchVectorVariant(Function *CalledFunc, unsigned VF,
+                                         bool IsMasked,
+                                         const TargetTransformInfo *TTI) {
+  DEBUG(dbgs() << "\nCall VF: " << VF << "\n");
+  unsigned TargetMaxRegWidth = TTI->getRegisterBitWidth(true);
+  DEBUG(dbgs() << "Target Max Register Width: " << TargetMaxRegWidth << "\n");
+
+  TargetTransformInfo::ISAClass TargetIsaClass =
+      TTI->getISAClassForMaxVecRegSize();
+  DEBUG(dbgs() << "Target ISA Class: "
+               << TTI->ISAClassToString(TargetIsaClass) << "\n\n");
+
+  Attribute Attr = CalledFunc->getFnAttribute("vector-variants");
+  StringRef VariantsStr = Attr.getValueAsString();
+  SmallVector<StringRef, 8> Variants;
+  VariantsStr.split(Variants, ",");
+  for (unsigned i = 0; i < Variants.size(); i++) {
+    VectorVariant *Variant = new VectorVariant(Variants[i], TTI);
+    TargetTransformInfo::ISAClass VariantIsaClass = Variant->getISA();
+    DEBUG(dbgs() << "Variant ISA Class: "
+                 << TTI->ISAClassToString(VariantIsaClass) << "\n");
+    unsigned IsaClassMaxRegWidth =
+        TTI->ISAClassMaxRegisterWidth(VariantIsaClass);
+    DEBUG(dbgs() << "Isa Class Max Vector Register Width: "
+                 << IsaClassMaxRegWidth << "\n");
+    unsigned FuncVF = Variant->getVlen();
+    DEBUG(dbgs() << "Func VF: " << FuncVF << "\n\n");
+
+    // Pick candidate functions based on the target ISA class, masked
+    // property, and loop VF == variant VF. For now, matching is limited to
+    // exact matching on VF. If no match exists based on these criteria, the
+    // calls will be scalarized. This could be extended in the future for
+    // when:
+    //
+    // 1) the only available simd function variants have a VF that is less
+    //    than the loop VF. In this case, multiple calls can be made to the
+    //    simd function. Currently, however, LV only keeps a 1-1 scalar ->
+    //    vector Value mapping.
+    //
+    // 2) the only available simd function variants have a VF that is greater
+    //    than the loop VF. In this case, we can make the call to the simd
+    //    function and effectively mask off the unused vector parts.
+    if (VariantIsaClass == TargetIsaClass && Variant->isMasked() == IsMasked &&
+        VF == FuncVF)
+      return Variant;
+
+    // Not a match; don't leak the candidate.
+    delete Variant;
+  }
+
+  return nullptr;
+}
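Reviewer note (not part of the patch): matchVectorVariant consumes entries of the "vector-variants" attribute, which follow the vector function ABI mangling _ZGV<isa><mask><vlen><parm-kinds>_<name>; for example, _ZGVdM8vlu_dowork is the AVX2 ('d'), masked ('M'), VF=8 clone of dowork with vector/linear/uniform parameters. A hedged standalone C++ sketch of that decoding follows; it assumes well-formed names and ignores linear-step suffixes such as 'l4', and parseVariant is an illustrative name, not a function in the patch.

#include <iostream>
#include <string>

// Decoded pieces of a '_ZGV<isa><mask><vlen><parm-kinds>_<name>' string.
struct Variant {
  char IsaClass;          // 'b'/'c'/'d'/'e' ~ SSE/AVX/AVX2/AVX-512 on x86
  bool Masked;            // 'M' masked, 'N' unmasked
  unsigned Vlen;          // vector length (VF)
  std::string ParamKinds; // one letter per parameter: v/l/u
  std::string Name;       // original scalar function name
};

bool parseVariant(const std::string &S, Variant &V) {
  if (S.compare(0, 4, "_ZGV") != 0 || S.size() < 8)
    return false;
  size_t Pos = 4;
  V.IsaClass = S[Pos++];
  V.Masked = S[Pos++] == 'M';
  size_t Digits = 0;
  V.Vlen = std::stoul(S.substr(Pos), &Digits); // assumes digits follow
  Pos += Digits;
  size_t Sep = S.find('_', Pos);
  if (Sep == std::string::npos)
    return false;
  V.ParamKinds = S.substr(Pos, Sep - Pos);
  V.Name = S.substr(Sep + 1);
  return true;
}

int main() {
  Variant V;
  if (parseVariant("_ZGVdM8vlu_dowork", V))
    std::cout << "isa=" << V.IsaClass << " masked=" << V.Masked
              << " vlen=" << V.Vlen << " kinds=" << V.ParamKinds
              << " name=" << V.Name << "\n";
}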
 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   switch (I.getOpcode()) {
   case Instruction::Br:
@@ -4895,13 +4949,42 @@
   Module *M = I.getParent()->getParent()->getParent();
   auto *CI = cast<CallInst>(&I);

-  StringRef FnName = CI->getCalledFunction()->getName();
   Function *F = CI->getCalledFunction();
+  StringRef FnName = F->getName();
+
+  // Find the appropriate simd function match and generate the mask if this
+  // is a masked simd function.
+  bool UseSimdFunction = F->hasFnAttribute("vector-variants");
+  std::vector<VectorKind> SimdFuncParms;
+  VectorVariant *SimdVariant = nullptr;
+  bool IsMasked = Legal->isMaskRequired(&I);
+  Type *CharacteristicTy = nullptr;
+  VectorParts Mask;
+  if (UseSimdFunction) {
+    SimdVariant = matchVectorVariant(F, VF, IsMasked, TTI);
+    assert(SimdVariant && "expected a matching vector variant");
+    DEBUG(dbgs() << "Matched Variant: " << SimdVariant->encode() << "\n");
+    SimdFuncParms = SimdVariant->getParameters();
+    CharacteristicTy = calcCharacteristicType(*F, *SimdVariant);
+    if (IsMasked)
+      Mask = createBlockInMask(CI->getParent());
+  }

   Type *RetTy = ToVectorTy(CI->getType(), VF);
   SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI->arg_operands())
-    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+  for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+    Value *ArgOperand = CI->getArgOperand(i);
+    if (!UseSimdFunction || SimdFuncParms[i].isVector())
+      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+    else
+      Tys.push_back(ArgOperand->getType());
+  }
+
+  // Masked simd functions need an extra mask parameter, so add its type to
+  // the Tys list.
+  if (UseSimdFunction && IsMasked)
+    Tys.push_back(ToVectorTy(CharacteristicTy, VF));

   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

@@ -4920,12 +5003,41 @@
     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
       Value *Arg = CI->getArgOperand(i);
       // Some intrinsics have a scalar argument - don't replace it with a
-      // vector.
-      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+      // vector. Likewise, linear and uniform parameters for simd functions
+      // are passed as scalars, so don't vectorize those either.
+      bool IsScalarSimdArg =
+          UseSimdFunction &&
+          (SimdFuncParms[i].isLinear() || SimdFuncParms[i].isUniform());
+      if (IsScalarSimdArg)
+        Arg = getOrCreateScalarValue(Arg, {Part, 0});
+      else if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
         Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
       Args.push_back(Arg);
     }

+    // Promote the <VF x i1> mask to a vector of the characteristic type and
+    // add it to the Args list.
+    if (UseSimdFunction && IsMasked) {
+      unsigned CharacteristicTySize =
+          CharacteristicTy->getPrimitiveSizeInBits();
+      // Mask is a vector of i1. Promote it to an integer type that has the
+      // same size as the characteristic type.
+      Type *ScalarToType = IntegerType::get(CharacteristicTy->getContext(),
+                                            CharacteristicTySize);
+      VectorType *VecToType = VectorType::get(ScalarToType, VF);
+      Value *MaskExt = Builder.CreateSExt(Mask[Part], VecToType, "mask.ext");
+
+      // Bitcast if the promoted type is not the same as the characteristic
+      // type.
+      if (ScalarToType != CharacteristicTy) {
+        Type *MaskCastTy = VectorType::get(CharacteristicTy, VF);
+        Value *MaskCast =
+            Builder.CreateBitCast(MaskExt, MaskCastTy, "mask.cast");
+        Args.push_back(MaskCast);
+      } else {
+        Args.push_back(MaskExt);
+      }
+    }
+
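Reviewer note (not part of the patch): the mask promotion above turns the <VF x i1> block mask into a full-width vector of the characteristic type, e.g. <8 x i1> -> sext -> <8 x i32> -> bitcast -> <8 x float> when the characteristic type is float. A condensed IRBuilder sketch of those same two steps, with illustrative types and VF (promoteMask is not a name in the patch):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

Value *promoteMask(IRBuilder<> &Builder, Value *MaskI1, Type *CharTy,
                   unsigned VF) {
  unsigned Bits = CharTy->getPrimitiveSizeInBits();
  // <VF x i1> -> <VF x iN>, where N is the characteristic type's bit width.
  Type *IntTy = IntegerType::get(CharTy->getContext(), Bits);
  Value *Ext =
      Builder.CreateSExt(MaskI1, VectorType::get(IntTy, VF), "mask.ext");
  if (IntTy == CharTy)
    return Ext;
  // e.g. <8 x i32> -> <8 x float> when the characteristic type is float.
  return Builder.CreateBitCast(Ext, VectorType::get(CharTy, VF), "mask.cast");
}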
     Function *VectorF;
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
@@ -4934,8 +5046,13 @@
       TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
     } else {
-      // Use vector version of the library call.
-      StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+      // Use vector version of the library call or a simd function. Keep the
+      // simd variant name alive past the if so VFnName doesn't dangle.
+      StringRef VFnName;
+      std::string VariantName;
+      if (UseSimdFunction) {
+        VariantName = SimdVariant->encode() + FnName.str();
+        VFnName = VariantName;
+      } else
+        VFnName = TLI->getVectorizedFunction(FnName, VF);
       assert(!VFnName.empty() && "Vector function name is empty.");
       VectorF = M->getFunction(VFnName);
       if (!VectorF) {
@@ -4959,6 +5076,13 @@
     addMetadata(V, &I);
   }

+  if (UseSimdFunction) {
+    // Remove simd function attributes from the original function just in
+    // case VecClone runs again.
+    F->removeFnAttr("vector-variants");
+    delete SimdVariant;
+  }
+
   break;
 }
@@ -5357,7 +5481,8 @@
       if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
           !isa<DbgInfoIntrinsic>(CI) &&
           !(CI->getCalledFunction() && TLI &&
-            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName())) &&
+          !(CI->getCalledFunction() &&
+            CI->getCalledFunction()->hasFnAttribute("vector-variants"))) {
         ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
                   << "call instruction cannot be vectorized");
         DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
@@ -5885,6 +6010,12 @@
       if (C->canTrap())
         return false;
     }
+    // We will need a mask for masked simd functions.
+    auto *CI = dyn_cast<CallInst>(&I);
+    if (CI && CI->getCalledFunction() &&
+        CI->getCalledFunction()->hasFnAttribute("vector-variants")) {
+      MaskedOp.insert(CI);
+      continue;
+    }
     // We might be able to hoist the load.
     if (I.mayReadFromMemory()) {
       auto *LI = dyn_cast<LoadInst>(&I);
@@ -8150,6 +8281,17 @@
     unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
     bool UseVectorIntrinsic =
         ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+
+    // If a match for the simd function was not found, then just scalarize
+    // the calls to the original function.
+    Function *F = CI->getCalledFunction();
+    if (F && F->hasFnAttribute("vector-variants")) {
+      bool IsMasked = Legal->isMaskRequired(I);
+      VectorVariant *SimdVariant = matchVectorVariant(F, VF, IsMasked, TTI);
+      if (!SimdVariant)
+        return false;
+      delete SimdVariant;
+    }
+
     return UseVectorIntrinsic || !NeedToScalarize;
   }
   if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
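Reviewer note (not part of the patch): when M->getFunction(VFnName) misses, the widening path has to materialize a declaration whose signature uses the widened Tys computed earlier. A hypothetical sketch of that getFunction/Function::Create sequence, under the assumption that the caller already built RetTy and Tys as above (getOrInsertVectorDecl is an illustrative name, not a function in the patch):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

Function *getOrInsertVectorDecl(Module &M, StringRef VFnName, Type *RetTy,
                                ArrayRef<Type *> Tys) {
  // Reuse an existing declaration or definition if the module has one.
  if (Function *Existing = M.getFunction(VFnName))
    return Existing;
  // e.g. '<8 x i32> @_ZGVdM8vlu_dowork(<8 x i32>, i32, i32, <8 x i32>)'.
  FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
  return Function::Create(FTy, Function::ExternalLinkage, VFnName, &M);
}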
Index: test/Transforms/LoopVectorize/masked_simd_func.ll
===================================================================
--- test/Transforms/LoopVectorize/masked_simd_func.ll
+++ test/Transforms/LoopVectorize/masked_simd_func.ll
@@ -0,0 +1,107 @@
+; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone.
+
+; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s
+
+; CHECK: call <8 x i32> @_ZGVdM8vlu_dowork
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 {
+entry:
+  %add = add nsw i32 %b, %k
+  %add1 = add nsw i32 %add, %c
+  ret i32 %add1
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+entry:
+  %a = alloca [4096 x i32], align 16
+  %b = alloca [4096 x i32], align 16
+  %0 = bitcast [4096 x i32]* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #4
+  %1 = bitcast [4096 x i32]* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv39 = phi i64 [ 0, %entry ], [ %indvars.iv.next40, %for.body ]
+  %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv39
+  %2 = trunc i64 %indvars.iv39 to i32
+  store i32 %2, i32* %arrayidx, align 4, !tbaa !2
+  %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1
+  %exitcond41 = icmp eq i64 %indvars.iv.next40, 4096
+  br i1 %exitcond41, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3
+  %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2
+  br label %omp.inner.for.body
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.inc, %for.end
+  %indvars.iv36 = phi i64 [ 0, %for.end ], [ %indvars.iv.next37, %omp.inner.for.inc ]
+  %4 = trunc i64 %indvars.iv36 to i32
+  %rem = and i32 %4, 1
+  %tobool = icmp eq i32 %rem, 0
+  br i1 %tobool, label %omp.inner.for.inc, label %if.then
+
+if.then:                                          ; preds = %omp.inner.for.body
+  %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv36
+  %5 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %call = tail call i32 @dowork(i32 %5, i32 %4, i32 %3), !llvm.mem.parallel_loop_access !6
+  %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv36
+  store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.inner.for.body, %if.then
+  %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
+  %exitcond38 = icmp eq i64 %indvars.iv.next37, 4096
+  br i1 %exitcond38, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.inc
+  br label %for.body11
+
+for.body11:                                       ; preds = %for.body11, %omp.inner.for.end
+  %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ]
+  %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.end17, label %for.body11
+
+for.end17:                                        ; preds = %for.body11
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #4
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #4
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+declare i32 @printf(i8*, ...) #3
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN16vlu_dowork,_ZGVbM4vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM16vlu_dowork" }
+attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 316400)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: test/Transforms/LoopVectorize/simd_func.ll
===================================================================
--- test/Transforms/LoopVectorize/simd_func.ll
+++ test/Transforms/LoopVectorize/simd_func.ll
@@ -0,0 +1,99 @@
+; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone.
+
+; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s
+
+; CHECK: call <8 x i32> @_ZGVdN8vlu_dowork
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 {
+entry:
+  %add = add nsw i32 %b, %k
+  %add1 = add nsw i32 %add, %c
+  ret i32 %add1
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+entry:
+  %a = alloca [4096 x i32], align 16
+  %b = alloca [4096 x i32], align 16
+  %0 = bitcast [4096 x i32]* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #4
+  %1 = bitcast [4096 x i32]* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv38 = phi i64 [ 0, %entry ], [ %indvars.iv.next39, %for.body ]
+  %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv38
+  %2 = trunc i64 %indvars.iv38 to i32
+  store i32 %2, i32* %arrayidx, align 4, !tbaa !2
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond40 = icmp eq i64 %indvars.iv.next39, 4096
+  br i1 %exitcond40, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3
+  %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2
+  br label %omp.inner.for.body
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.body, %for.end
+  %indvars.iv35 = phi i64 [ 0, %for.end ], [ %indvars.iv.next36, %omp.inner.for.body ]
+  %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv35
+  %4 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %5 = trunc i64 %indvars.iv35 to i32
+  %call = tail call i32 @dowork(i32 %4, i32 %5, i32 %3), !llvm.mem.parallel_loop_access !6
+  %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv35
+  store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond37 = icmp eq i64 %indvars.iv.next36, 4096
+  br i1 %exitcond37, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.body
+  br label %for.body11
+
+for.body11:                                       ; preds = %for.body11, %omp.inner.for.end
+  %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ]
+  %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.end17, label %for.body11
+
+for.end17:                                        ; preds = %for.body11
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #4
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #4
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+declare i32 @printf(i8*, ...) #3
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN16vlu_dowork,_ZGVbM4vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM16vlu_dowork" }
+attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 316400)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: test/Transforms/LoopVectorize/simd_func_scalar.ll
===================================================================
--- test/Transforms/LoopVectorize/simd_func_scalar.ll
+++ test/Transforms/LoopVectorize/simd_func_scalar.ll
@@ -0,0 +1,111 @@
+; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone.
+
+; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s
+
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+; CHECK: extractelement <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK: call i32 @dowork
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 {
+entry:
+  %add = add nsw i32 %b, %k
+  %add1 = add nsw i32 %add, %c
+  ret i32 %add1
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+entry:
+  %a = alloca [4096 x i32], align 16
+  %b = alloca [4096 x i32], align 16
+  %0 = bitcast [4096 x i32]* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #4
+  %1 = bitcast [4096 x i32]* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv38 = phi i64 [ 0, %entry ], [ %indvars.iv.next39, %for.body ]
+  %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv38
+  %2 = trunc i64 %indvars.iv38 to i32
+  store i32 %2, i32* %arrayidx, align 4, !tbaa !2
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond40 = icmp eq i64 %indvars.iv.next39, 4096
+  br i1 %exitcond40, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3
+  %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2
+  br label %omp.inner.for.body
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.body, %for.end
+  %indvars.iv35 = phi i64 [ 0, %for.end ], [ %indvars.iv.next36, %omp.inner.for.body ]
+  %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv35
+  %4 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %5 = trunc i64 %indvars.iv35 to i32
+  %call = tail call i32 @dowork(i32 %4, i32 %5, i32 %3), !llvm.mem.parallel_loop_access !6
+  %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv35
+  store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond37 = icmp eq i64 %indvars.iv.next36, 4096
+  br i1 %exitcond37, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.body
+  br label %for.body11
+
+for.body11:                                       ; preds = %for.body11, %omp.inner.for.end
+  %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ]
+  %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.end17, label %for.body11
+
+for.end17:                                        ; preds = %for.body11
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #4
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #4
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+declare i32 @printf(i8*, ...) #3
+
+attributes #0 = { noinline norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN8vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN8vlu_dowork,_ZGVbM8vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM8vlu_dowork" }
+attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 316400)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}