Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -740,6 +740,27 @@
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
+  /// Check legality of the given SVML call instruction \p VecCall generated
+  /// for scalar call \p Call. If illegal, the appropriate legal instruction
+  /// is returned.
+  Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call);
+
+  /// Returns the legal VF for a call instruction \p CI using TTI information
+  /// and the vector type.
+  unsigned getLegalVFForCall(CallInst *CI);
+
+  /// Partially vectorize a given call \p Call by breaking it down into
+  /// multiple calls of \p LegalCall, decided by the variant VF \p LegalVF.
+  Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall,
+                              unsigned LegalVF);
+
+  /// Generate a shufflevector instruction for a vector value \p V based on
+  /// the current \p Part and a smaller VF \p LegalVF.
+  Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part);
+
+  /// Combine partially vectorized calls stored in \p CallResults.
+  Value *combinePartialVecCalls(SmallVectorImpl<Value *> &CallResults);
+
   /// The original loop.
   Loop *OrigLoop;
@@ -4676,11 +4697,255 @@
     if (isa<FPMathOperator>(V))
       V->copyFastMathFlags(CI);
 
-    VectorLoopValueMap.setVectorValue(&I, Part, V);
-    addMetadata(V, &I);
+    // Perform legalization of the SVML call instruction only if the original
+    // call was not an intrinsic.
+    if (!isa<IntrinsicInst>(CI) &&
+        (V->getCalledFunction()->getName()).startswith("__svml")) {
+      LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump());
+      auto *LegalV = cast<CallInst>(legalizeSVMLCall(V, CI));
+      LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: ";
+                 LegalV->dump());
+      VectorLoopValueMap.setVectorValue(&I, Part, LegalV);
+      addMetadata(LegalV, &I);
+    } else {
+      VectorLoopValueMap.setVectorValue(&I, Part, V);
+      addMetadata(V, &I);
+    }
   }
 }
 
+//===----------------------------------------------------------------------===//
+// Implementation of functions for SVML vector call legalization.
+//===----------------------------------------------------------------------===//
+//
+// Unlike other VECLIBs, SVML needs to be used with target-legal
+// vector types. Otherwise, link failures and/or runtime failures
+// will occur. A motivating example:
+//
+//   double *a;
+//   float *b;
+//   #pragma clang loop vectorize_width(8)
+//   for (i = 0; i < N; ++i) {
+//     a[i] = sin(i);  // Legal SVML VF must be 4 or below on AVX
+//     b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM
+//   }
+//
+// The current implementation of vector code generation in LV is
+// driven by a single VF (in InnerLoopVectorizer::VF). This
+// inhibits the flexibility of adjusting/choosing different VFs
+// for different instructions.
+//
+// Due to this limitation it is much more straightforward to
+// first generate the illegal sin8 (__svml_sin8 for the SVML vector
+// library) call and then legalize it than to try to avoid
+// generating illegal code in the first place.
+//
+// The solution used here is to check the legality of the call
+// instruction right after generating it in the vectorizer and, if it is
+// illegal, to split the call arguments and issue multiple calls that
+// match the legal VF. This is currently demonstrated for the SVML
+// vector library calls (non-intrinsic version only).
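+//
+// As an illustration (taken from the svml-legal-codegen.ll test added in
+// this patch; the value names below are only illustrative), an illegal
+// VF-8 call over doubles on AVX
+//
+//   %t = call <8 x double> @__svml_sin8(<8 x double> %x)
+//
+// is legalized into two VF-4 calls whose operands and results are split
+// and recombined with shufflevectors:
+//
+//   %x.lo = shufflevector <8 x double> %x, <8 x double> undef,
+//           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+//   %t.lo = call <4 x double> @__svml_sin4(<4 x double> %x.lo)
+//   %x.hi = shufflevector <8 x double> %x, <8 x double> undef,
+//           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+//   %t.hi = call <4 x double> @__svml_sin4(<4 x double> %x.hi)
+//   %t    = shufflevector <4 x double> %t.lo, <4 x double> %t.hi,
+//           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>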
+//
+// Future directions and extensions:
+// 1) This legalization example shows us that a good direction
+//    for the VPlan framework would be to model vector call
+//    instructions in a way that the legal VF for each call is chosen
+//    correctly within the vectorizer and illegal code generation is
+//    avoided.
+// 2) This logic can also be extended to general vector functions,
+//    i.e., legalization of OpenMP declare simd functions. The
+//    requirements needed for this will be documented soon.

+Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall,
+                                             CallInst *Call) {
+  unsigned LegalVF = getLegalVFForCall(VecCall);
+
+  assert(LegalVF > 1 &&
+         "Legal VF for SVML call must be greater than 1 to vectorize");
+
+  if (LegalVF == VF.getKnownMinValue())
+    return VecCall;
+  else if (LegalVF > VF.getKnownMinValue())
+    // TODO: handle case when we are underfilling vectors
+    return VecCall;
+
+  // The legal VF for this SVML call is smaller than the chosen VF; break it
+  // down into smaller call instructions.
+
+  // Convert args, types and return type to match the legal VF.
+  SmallVector<Type *, 4> NewTys;
+  SmallVector<Value *, 4> NewArgs;
+  Type *NewRetTy = ToVectorTy(Call->getType(), LegalVF);
+
+  for (Value *ArgOperand : Call->arg_operands()) {
+    Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF);
+    NewTys.push_back(Ty);
+    NewArgs.push_back(UndefValue::get(Ty));
+  }
+
+  // Construct the legal vector function.
+  Function *F = Call->getCalledFunction();
+  StringRef FnName = F->getName();
+  Module *M = Call->getModule();
+  StringRef LegalVFnName = TLI->getVectorizedFunction(FnName, LegalVF);
+  assert(!LegalVFnName.empty() && (LegalVFnName != FnName) &&
+         "Could not find legal vector function in TLI.");
+
+  Function *LegalVectorF = M->getFunction(LegalVFnName);
+  if (!LegalVectorF) {
+    FunctionType *LegalFTy = FunctionType::get(NewRetTy, NewTys, false);
+    LegalVectorF =
+        Function::Create(LegalFTy, Function::ExternalLinkage, LegalVFnName, M);
+    LegalVectorF->copyAttributesFrom(F);
+  }
+  assert(LegalVectorF &&
+         "Module does not have vector version for legal SVML call.");
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump());
+
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  Call->getOperandBundlesAsDefs(OpBundles);
+  CallInst *LegalV = CallInst::Create(LegalVectorF, NewArgs, OpBundles);
+
+  if (isa<FPMathOperator>(LegalV))
+    LegalV->copyFastMathFlags(Call);
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump());
+
+  Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV, LegalVF);
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump());
+
+  // Remove the illegal call from the IR.
+  VecCall->eraseFromParent();
+
+  // LegalV only served as a template for the partial calls and was never
+  // inserted; free it.
+  if (LegalV)
+    delete LegalV;
+
+  return LegalizedCall;
+}
+
+unsigned InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) {
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+  FunctionType *CallFT = CI->getFunctionType();
+  // All functions that need legalization should have a vector return type.
+  // This is true for all SVML functions that are currently supported.
+  assert(isa<VectorType>(CallFT->getReturnType()) &&
+         "Return type of call that needs legalization is not a vector.");
+  auto *VecCallRetType = cast<VectorType>(CallFT->getReturnType());
+  Type *ElemType = VecCallRetType->getElementType();
+
+  unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType);
+  unsigned VectorBitWidth = TTI->getRegisterBitWidth(true);
+  unsigned LegalVF = VectorBitWidth / TypeBitWidth;
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth
+                    << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n");
+
+  return LegalVF;
+}
+
+// Partial vectorization of a call instruction is achieved by making clones of
+// \p LegalCall and overwriting its argument operands with shufflevector
+// equivalents, chosen based on \p LegalVF and the current Part being filled.
+Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call,
+                                                 CallInst *LegalCall,
+                                                 unsigned LegalVF) {
+  unsigned NumParts = VF.getKnownMinValue() / LegalVF;
+  LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n");
+  SmallVector<Value *, 8> CallResults;
+
+  for (unsigned Part = 0; Part < NumParts; ++Part) {
+    auto *ClonedCall = cast<CallInst>(LegalCall->clone());
+
+    // Update the arg operands of the cloned call to shufflevectors.
+    for (unsigned i = 0, ie = Call->getNumArgOperands(); i != ie; ++i) {
+      auto *NewOp =
+          generateShuffleValue(Call->getArgOperand(i), LegalVF, Part);
+      ClonedCall->setArgOperand(i, NewOp);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump());
+
+    auto *PartialVecCall = Builder.Insert(ClonedCall);
+    CallResults.push_back(PartialVecCall);
+  }
+
+  return combinePartialVecCalls(CallResults);
+}
+
+Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF,
+                                                 unsigned Part) {
+  // Example:
+  // Consider the following vector code -
+  //   %1 = sitofp <4 x i32> %0 to <4 x double>
+  //   %2 = call <4 x double> @__svml_sin4(<4 x double> %1)
+  //
+  // If the LegalVF is 2, we partially vectorize the sin4 call by invoking
+  // generateShuffleValue on the operand %1.
+  // For Part = 0 the output value is -
+  //   %shuffle = shufflevector <4 x double> %1, <4 x double> undef,
+  //              <2 x i32> <i32 0, i32 1>
+  // and for Part = 1 the output is -
+  //   %shuffle7 = shufflevector <4 x double> %1, <4 x double> undef,
+  //               <2 x i32> <i32 2, i32 3>
+
+  assert(isa<VectorType>(V->getType()) &&
+         "Cannot generate shuffles for non-vector values.");
+  SmallVector<int, 4> ShuffleMask;
+  Value *Undef = UndefValue::get(V->getType());
+
+  unsigned ElemIdx = Part * LegalVF;
+
+  for (unsigned K = 0; K < LegalVF; K++)
+    ShuffleMask.push_back(ElemIdx + K);
+
+  auto *ShuffleInst =
+      Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle");
+
+  return ShuffleInst;
+}
+
+// Results of the calls executed by the smaller legal call instructions must
+// be combined to match the original VF for later use. This is done by
+// constructing shufflevector instructions in a cumulative fashion.
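+//
+// For example (an illustrative sketch of the algorithm below, not tied to a
+// particular test), with VF = 8 and LegalVF = 2 there are four partial
+// <2 x double> results R0..R3. The first sweep emits
+//   %c01 = shufflevector R0, R1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+//   %c23 = shufflevector R2, R3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// and the second sweep combines those into the final
+//   %c = shufflevector %c01, %c23,
+//        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>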
+Value *InnerLoopVectorizer::combinePartialVecCalls(
+    SmallVectorImpl<Value *> &CallResults) {
+  assert(isa<VectorType>(CallResults[0]->getType()) &&
+         "Cannot combine calls with non-vector results.");
+  auto *CallType = cast<VectorType>(CallResults[0]->getType());
+
+  Value *CombinedShuffle = nullptr;
+  unsigned NumElems = CallType->getNumElements() * 2;
+  unsigned NumRegs = CallResults.size();
+
+  assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) &&
+         "Number of partial vector calls to combine must be a power of 2 "
+         "(at least 2^1)");
+
+  while (NumRegs > 1) {
+    for (unsigned I = 0; I < NumRegs; I += 2) {
+      SmallVector<int, 8> ShuffleMask;
+      for (unsigned J = 0; J < NumElems; J++)
+        ShuffleMask.push_back(J);
+
+      CombinedShuffle = Builder.CreateShuffleVector(
+          CallResults[I], CallResults[I + 1], ShuffleMask, "combined");
+      LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:";
+                 CombinedShuffle->dump());
+      CallResults.push_back(CombinedShuffle);
+    }
+
+    // Drop the values that were just combined; the newly created wider
+    // shuffles remain at the end of CallResults for the next round.
+    SmallVectorImpl<Value *>::iterator Start = CallResults.begin();
+    SmallVectorImpl<Value *>::iterator End = Start + NumRegs;
+    CallResults.erase(Start, End);
+
+    NumElems *= 2;
+    NumRegs /= 2;
+  }
+
+  return CombinedShuffle;
+}
+
 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
                                                  VPUser &Operands,
                                                  bool InvariantCond,
Index: llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
+++ llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -39,7 +39,8 @@
 declare double @__exp_finite(double) #0
 
 ; CHECK-LABEL: @exp_f64
-; CHECK: <4 x double> @__svml_exp4
+; CHECK: <2 x double> @__svml_exp2
+; CHECK: <2 x double> @__svml_exp2
 ; CHECK: ret
 define void @exp_f64(double* nocapture %varray) {
 entry:
@@ -99,7 +100,8 @@
 declare double @__log_finite(double) #0
 
 ; CHECK-LABEL: @log_f64
-; CHECK: <4 x double> @__svml_log4
+; CHECK: <2 x double> @__svml_log2
+; CHECK: <2 x double> @__svml_log2
 ; CHECK: ret
 define void @log_f64(double* nocapture %varray) {
 entry:
@@ -159,7 +161,8 @@
 declare double @__pow_finite(double, double) #0
 
 ; CHECK-LABEL: @pow_f64
-; CHECK: <4 x double> @__svml_pow4
+; CHECK: <2 x double> @__svml_pow2
+; CHECK: <2 x double> @__svml_pow2
 ; CHECK: ret
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
 entry:
@@ -219,7 +222,8 @@
 
 define void @exp2_finite(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_finite(
-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}})
+; CHECK: call <2 x double> @__svml_exp22(<2 x double> {{.*}})
+; CHECK: call <2 x double> @__svml_exp22(<2 x double> {{.*}})
 ; CHECK: ret void
 ;
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
@@ -0,0 +1,508 @@
+; Check legalization of SVML calls. Also checks that intrinsic calls are not legalized by the vectorizer.
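+;
+; With -force-vector-width=8 and -mattr=avx, the double-precision SVML calls
+; below are expected to be split into two VF-4 calls (only 4 doubles fit in a
+; 256-bit YMM register), while the single-precision calls stay at VF 8
+; (8 floats fit in a YMM register).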
+ +; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare double @sin(double) #0 +declare float @sinf(float) #0 +declare double @llvm.sin.f64(double) #0 +declare float @llvm.sin.f32(float) #0 + +declare double @cos(double) #0 +declare float @cosf(float) #0 +declare double @llvm.cos.f64(double) #0 +declare float @llvm.cos.f32(float) #0 + +declare double @pow(double, double) #0 +declare float @powf(float, float) #0 +declare double @llvm.pow.f64(double, double) #0 +declare float @llvm.pow.f32(float, float) #0 + +declare double @exp(double) #0 +declare float @expf(float) #0 +declare double @llvm.exp.f64(double) #0 +declare float @llvm.exp.f32(float) #0 + +declare double @log(double) #0 +declare float @logf(float) #0 +declare double @llvm.log.f64(double) #0 +declare float @llvm.log.f32(float) #0 + + +define void @sin_f64(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @sin(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32(float* nocapture %varray) { +; CHECK-LABEL: @sin_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_sinf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @sinf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_sin8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.sin.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @sin_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_sinf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.sin.f32(float %conv) + %arrayidx = getelementptr inbounds 
float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @cos(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_cosf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @cosf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_cos8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.cos.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_cosf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.cos.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) +; CHECK: [[TMP4:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = 
getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @pow(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_pow8(<8 x double> [[TMP2:%.*]], <8 x double> [[TMP3:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_powf8(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float @powf(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_powf8(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @exp(double %conv) + %arrayidx = getelementptr 
inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_expf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @expf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_exp8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.exp.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_expf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.exp.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64(double* nocapture %varray) { +; CHECK-LABEL: @log_f64( +; CHECK: [[TMP1:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @log(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32(float* nocapture %varray) { +; CHECK-LABEL: @log_f32( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_logf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @logf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add 
nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @log_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x double> @__svml_log8(<8 x double> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.log.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @log_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call <8 x float> @__svml_logf8(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.log.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind readnone } + Index: llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll @@ -0,0 +1,61 @@ +; Check that vector codegen splits illegal sin8 call to two sin4 calls on AVX for double datatype. 
+; The C code used to generate this test:
+
+; #include <math.h>
+;
+; void foo(double *a, int N){
+;   int i;
+; #pragma clang loop vectorize_width(8)
+;   for (i=0;i<N;i++){
+;     a[i] = sin(i);
+;   }
+; }
+
+; RUN: opt -vector-library=SVML -loop-vectorize -mattr=avx -S < %s | FileCheck %s
+
+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[I2:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[S1]])
+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[I3:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[S2]])
+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp5 = icmp sgt i32 %N, 0
+  br i1 %cmp5, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %0 to double
+  %call = tail call double @sin(double %conv) #2
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %call, double* %arrayidx, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare dso_local double @sin(double) local_unnamed_addr #1
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}