Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -628,6 +628,30 @@ /// \brief Additional properties of an operand's values. enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; + /// \brief Targets defined in the vector function ABI. + enum TargetProcessor { + Pentium4, // ISA extension = SSE2, ISA class = XMM + Pentium4SSE3, // ISA extension = SSE3, ISA class = XMM + Core2DuoSSSE3, // ISA extension = SSSE3, ISA class = XMM + Core2DuoSSE41, // ISA extension = SSE4_1, ISA class = XMM + CoreI7SSE42, // ISA extension = SSE4_2, ISA class = XMM + Core2ndGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core3rdGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core4thGenAVX, // ISA extension = AVX2, ISA class = YMM2 + Mic, // ISA extension = Xeon Phi, ISA class = MIC(ZMM) + FutureCpu22, // ISA extension = AVX512, ISA class = ZMM + FutureCpu23, // ISA extension = AVX512, ISA class = ZMM + }; + + /// ISA classes defined in the vector function ABI. + enum ISAClass { + XMM, // (SSE2) + YMM1, // (AVX1) + YMM2, // (AVX2) + ZMM, // (MIC) + ISAClassesNum + }; + /// \return The number of scalar or vector registers that the target has. /// If 'Vectors' is true, it returns the number of vector registers. If it is /// set to false, it returns the number of scalar registers. @@ -887,6 +911,21 @@ unsigned ChainSizeInBytes, VectorType *VecTy) const; + /// \returns The maximum vector register width based on ISAClass \p Class, + /// as defined in the vector function ABI. + unsigned maximumSizeofISAClassVectorRegister(ISAClass Class, Type *Ty) const; + + /// \returns The encoded ISA class for the mangled vector variant name based + /// on \p IsaClass. + char encodeISAClass(ISAClass IsaClass) const; + + /// \returns The ISAClass from the character encoded \p IsaClass of the + /// mangled vector variant function name. + ISAClass decodeISAClass(char IsaClass) const; + + /// \returns The target legalized type of \P Ty based on ISAClass \p IsaClass. + Type* promoteToSupportedType(Type *Ty, ISAClass IsaClass) const; + /// Flags describing the kind of vector reduction. 
struct ReductionFlags { ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {} @@ -1088,6 +1127,11 @@ virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual unsigned maximumSizeofISAClassVectorRegister(ISAClass Class, + Type *Ty) const = 0; + virtual char encodeISAClass(ISAClass IsaClass) const = 0; + virtual ISAClass decodeISAClass(char IsaClass) const = 0; + virtual Type* promoteToSupportedType(Type *Ty, ISAClass IsaClass) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; @@ -1450,6 +1494,19 @@ VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + unsigned maximumSizeofISAClassVectorRegister(ISAClass Class, + Type *Ty) const override { + return Impl.maximumSizeofISAClassVectorRegister(Class, Ty); + } + char encodeISAClass(ISAClass IsaClass) const override { + return Impl.encodeISAClass(IsaClass); + } + ISAClass decodeISAClass(char IsaClass) const override { + return Impl.decodeISAClass(IsaClass); + } + Type* promoteToSupportedType(Type *Ty, ISAClass IsaClass) const override { + return Impl.promoteToSupportedType(Ty, IsaClass); + } bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -529,6 +529,23 @@ return VF; } + unsigned maximumSizeofISAClassVectorRegister( + TTI::ISAClass I, Type *Ty) const { + return 0; + } + + char encodeISAClass(TTI::ISAClass IsaClass) const { + return '?'; + } + + TTI::ISAClass decodeISAClass(char IsaClass) const { + return TTI::ISAClassesNum; + } + + Type* promoteToSupportedType(Type *Ty, TTI::ISAClass IsaClass) const { + return Ty; + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; Index: include/llvm/Analysis/VectorVariant.h =================================================================== --- include/llvm/Analysis/VectorVariant.h +++ include/llvm/Analysis/VectorVariant.h @@ -0,0 +1,231 @@ +//===---- llvm/Transforms/VectorVariant.h - Vector utilities -*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This header file defines the VectorVariant class and implements the encoding +/// and decoding utilities for VectorVariant objects. Multiple VectorVariant +/// objects can be created (masked, non-masked, etc.) and associated with the +/// original scalar function. These objects are then used to clone new functions +/// that can be vectorized. This class follows the standards defined in the +/// vector function ABI. 
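As a concrete illustration of the decoding described above, a minimal usage sketch follows (hypothetical caller code; it assumes a valid TargetTransformInfo pointer and the X86 ISA-class letters added elsewhere in this patch):

    // "_ZGVbN4uvl_" decodes as: 'b' = XMM ISA class, 'N' = non-masked,
    // vector length 4, parameters: uniform, vector, linear with unit stride.
    VectorVariant Variant("_ZGVbN4uvl_", TTI);
    Variant.getISA();                    // TargetTransformInfo::XMM
    Variant.isMasked();                  // false
    Variant.getVlen();                   // 4
    Variant.generateFunctionName("foo"); // "_ZGVbN4uvl_foo"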
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H +#define LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H + +#include +#include +#include +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define STRIDE_KIND 's' +#define LINEAR_KIND 'l' +#define UNIFORM_KIND 'u' +#define VECTOR_KIND 'v' + +#define NOT_ALIGNED 1 + +#define POSITIVE 1 +#define NEGATIVE -1 + +class VectorKind { + +public: + VectorKind(char K, int S, int A = NOT_ALIGNED) { + + assert((S == notAValue() || K == STRIDE_KIND || K == LINEAR_KIND) && + "only linear vectors have strides"); + + assert((K != LINEAR_KIND || S != notAValue()) && + "linear vectors must have a stride"); + + assert((K != STRIDE_KIND || S != notAValue()) && + "variable stride vectors must have a stride"); + + assert((K != STRIDE_KIND || S >= 0) && + "variable stride position must be non-negative"); + + assert(A > 0 && "alignment must be positive"); + + Kind = K; + Stride = S; + Alignment = A; + } + + VectorKind(const VectorKind &Other) { + Kind = Other.Kind; + Stride = Other.Stride; + Alignment = Other.Alignment; + } + + /// \brief Is the stride for a linear parameter a uniform variable? (i.e., + /// the stride is stored in a variable but is uniform) + bool isVariableStride() { return Kind == STRIDE_KIND; } + + /// \brief Is the stride for a linear variable non-unit stride? + bool isNonUnitStride() { return Kind == LINEAR_KIND && Stride != 1; } + + /// \brief Is the stride for a linear variable unit stride? + bool isUnitStride() { return Kind == LINEAR_KIND && Stride == 1; } + + /// \brief Is this a linear parameter? + bool isLinear() { + return isVariableStride() || isNonUnitStride() || isUnitStride(); + } + + /// \brief Is this a uniform parameter? + bool isUniform() { return Kind == UNIFORM_KIND; } + + /// \brief Is this a vector parameter? + bool isVector() { return Kind == VECTOR_KIND; } + + /// \brief Is the parameter aligned? + bool isAligned() { return Alignment != NOT_ALIGNED; } + + /// \brief Get the stride associated with a linear parameter. + int getStride() { return Stride; } + + /// \brief Get the alignment associated with a linear parameter. + int getAlignment() { return Alignment; } + + /// \brief Represents a don't care value for strides of parameters other + /// than linear parameters. + static int notAValue() { return -1; } + + /// \brief Encode the parameter information into a mangled string + /// corresponding to the standards defined in the vector function ABI. + std::string encode() { + std::stringstream SST; + SST << Kind; + + if (isNonUnitStride()) { + if (Stride >= 0) + SST << Stride; + else + SST << "n" << -Stride; + } + + if (isVariableStride()) + SST << Stride; + + if (isAligned()) + SST << 'a' << Alignment; + + return SST.str(); + } + +private: + char Kind; // linear, uniform, vector + int Stride; + int Alignment; +}; + +class VectorVariant { + +private: + TargetTransformInfo *TTI; + TargetTransformInfo::ISAClass Isa; + bool Mask; + unsigned int Vlen; + std::vector Parameters; + + static std::string prefix() { return "_ZGV"; } + +public: + VectorVariant(StringRef FuncName, TargetTransformInfo *TTI); + + /// \brief Get the ISA corresponding to this vector variant. + TargetTransformInfo::ISAClass getISA() { return Isa; } + + /// \brief Is this a masked vector function variant? 
+  bool isMasked() { return Mask; }
+
+  /// \brief Get the vector length of the vector variant.
+  unsigned int getVlen() { return Vlen; }
+
+  /// \brief Get the parameters of the vector variant.
+  std::vector<VectorKind> &getParameters() { return Parameters; }
+
+  /// \brief Build the mangled name for the vector variant. This function
+  /// builds a mangled name by including the encodings for the ISA class,
+  /// mask information, and all parameters.
+  std::string encode() {
+
+    std::stringstream SST;
+    SST << prefix() << TTI->encodeISAClass(Isa) << encodeMask(Mask) << Vlen;
+
+    std::vector<VectorKind>::iterator It = Parameters.begin();
+    std::vector<VectorKind>::iterator End = Parameters.end();
+
+    if (isMasked())
+      End--; // mask parameter is not encoded
+
+    for (; It != End; ++It)
+      SST << (*It).encode();
+
+    SST << "_";
+
+    return SST.str();
+  }
+
+  /// \brief Generate a function name corresponding to a vector variant.
+  std::string generateFunctionName(StringRef ScalarFuncName) {
+
+    static StringRef ManglingPrefix("_Z");
+    std::string Name = encode();
+
+    if (ScalarFuncName.startswith(ManglingPrefix))
+      return Name + ScalarFuncName.drop_front(ManglingPrefix.size()).str();
+    else
+      return Name + ScalarFuncName.str();
+  }
+
+  /// \brief Some targets do not support particular types, so promote to a type
+  /// that is supported.
+  Type *promoteToSupportedType(Type *Ty) {
+    return TTI->promoteToSupportedType(Ty, getISA());
+  }
+
+  /// \brief Check to see if this is a vector variant based on the function
+  /// name.
+  static bool isVectorVariant(StringRef FuncName) {
+    return FuncName.startswith(prefix());
+  }
+
+  /// \brief Encode the mask information for the mangled variant name.
+  static char encodeMask(bool EncodeMask) {
+
+    return EncodeMask ? 'M' : 'N';
+  }
+
+  /// \brief Decode the mask information from the mangled variant name.
+  static bool decodeMask(char MaskToDecode) {
+
+    switch (MaskToDecode) {
+    case 'M':
+      return true;
+    case 'N':
+      return false;
+    }
+
+    llvm_unreachable("unsupported mask");
+  }
+
+  /// \brief Calculate the vector length for the vector variant.
+ unsigned calcVlen(TargetTransformInfo::ISAClass I, Type *Ty); +}; + +#endif // LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -377,6 +377,7 @@ void initializeWriteBitcodePassPass(PassRegistry&); void initializeWriteThinLTOBitcodePass(PassRegistry&); void initializeXRayInstrumentationPass(PassRegistry&); +void initializeVecClonePass(PassRegistry&); } // end namespace llvm Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -48,6 +48,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/VecClone.h" #include "llvm/Transforms/Vectorize.h" #include @@ -207,6 +208,7 @@ (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); (void) llvm::createScalarizeMaskedMemIntrinPass(); + (void) llvm::createVecClonePass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); Index: include/llvm/Transforms/Utils/VecClone.h =================================================================== --- include/llvm/Transforms/Utils/VecClone.h +++ include/llvm/Transforms/Utils/VecClone.h @@ -0,0 +1,236 @@ +//===-------------- VecClone.h - Class definition -*- C++ -*---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This file defines the VecClone pass class. +/// +// ===--------------------------------------------------------------------=== // + +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "llvm/Analysis/VectorVariant.h" + +#ifndef LLVM_TRANSFORMS_VPO_VECCLONE_H +#define LLVM_TRANSFORMS_VPO_VECCLONE_H + +enum InstType { + ALLOCA = 0, + STORE, + BITCAST +}; + +namespace llvm { + +class ModulePass; + +/// \brief Represents the mapping of a vector parameter to its corresponding +/// vector to scalar type cast instruction. This done so that the scalar loop +/// inserted by this pass contains instructions that are in scalar form so that +/// the loop can later be vectorized. +struct ParmRef { + // Represents the parameter in one of two forms: + // 1) A vector alloca instruction if the parameter has not been registerized. + // 2) The parameter as the Value* passed in via the function call. + Value *VectorParm; + + // Represents the vector parameter cast from a vector type to scalar type. + Instruction *VectorParmCast; +}; + +class VecClone : public ModulePass { + + private: + + /// \brief Contains the names of the declared vector function variants + typedef std::vector DeclaredVariants; + + /// \brief Contains a mapping of a function to its vector function variants + typedef std::map FunctionVariants; + + /// \brief Determine the characteristic type of the vector function as + /// specified according to the vector function ABI. 
+ Type* calcCharacteristicType(Function& F, VectorVariant& Variant); + + /// \brief Get all functions marked for vectorization in module and their + /// list of variants. + void getFunctionsToVectorize( + Module &M, std::map > &FuncVars); + + /// \brief Returns a floating point or integer constant depending on Ty. + template + Constant* getConstantValue(Type *Ty, LLVMContext &Context, T Val); + + /// \brief Return true if the function has a complex type for the return + /// or parameters. + bool hasComplexType(Function *F); + + /// \brief Make a copy of the function if it is marked as SIMD. + Function* CloneFunction(Function &F, VectorVariant &V); + + /// \brief Take the entry basic block for the function as split off a second + /// basic block that will form the loop entry. + BasicBlock* splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock); + + /// \brief Take the loop entry basic block and split off a second basic + /// block into a new return basic block. + BasicBlock* splitLoopIntoReturn(Function *Clone, BasicBlock *LoopBlock); + + /// \brief Generate a basic block to test the loop exit condition. + BasicBlock* createLoopExit(Function *Clone, BasicBlock *ReturnBlock); + + /// \brief Update the predecessors of the return basic block. + void updateReturnPredecessors(Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the backedge from the loop exit basic block to the loop + /// entry block. + PHINode* createPhiAndBackedgeForLoop(Function *Clone, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock, + int VL); + + /// \brief Generate vector alloca instructions for vector parameters and + /// change the parameter types to vector types. Expand the return value of + /// the function to a vector type. This function returns the instruction + /// corresponding to the expanded return and the instruction corresponding + /// to the mask. + Instruction* expandVectorParametersAndReturn( + Function *Clone, + VectorVariant &V, + Instruction **Mask, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Expand the function parameters to vector types. This function + /// returns the instruction corresponding to the mask. + Instruction* expandVectorParameters( + Function *Clone, + VectorVariant &V, + BasicBlock *EntryBlock, + std::vector& ParmMap); + + /// \brief Expand the function's return value to a vector type. + Instruction* expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Update the old parameter references to with the new vector + /// references. + void updateScalarMemRefsWithVector( + Function *Clone, + Function &F, + BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, + PHINode *Phi, + std::vector& ParmMap); + + /// \brief Update the values of linear parameters by adding the stride + /// before the use. + void updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi); + + /// \brief Update the instructions in the return basic block to return a + /// vector temp. + void updateReturnBlockInstructions(Function *Clone, BasicBlock *ReturnBlock, + Instruction *VecReturnAlloca); + + /// \brief Create a separate basic block to mark the begin and end of the + /// SIMD loop formed from the vector function. 
Essentially, this function + /// transfers the information from the SIMD function keywords and creates + /// new loop pragmas so that parameter information can be transferred to + /// the loop. + void insertDirectiveIntrinsics(Module& M, Function *Clone, Function &F, + VectorVariant &V, + BasicBlock *EntryBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the basic block indicating the begin of the SIMD loop. + void insertBeginRegion(Module& M, Function *Clone, Function &F, + VectorVariant &V, BasicBlock *EntryBlock); + + /// \brief Create the basic block indicating the end of the SIMD loop. + void insertEndRegion(Module& M, Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create a new vector alloca instruction for the return vector and + /// bitcast to the appropriate element type. + Instruction* createExpandedReturn(Function *F, BasicBlock *BB, + VectorType *ReturnType); + + /// \brief Return the position of the parameter in the function's parameter + /// list. + int getParmIndexInFunction(Function *F, Value *Parm); + + /// \brief Check to see if the function is simple enough that a loop does + /// not need to be inserted into the function. + bool isSimpleFunction(Function *Clone, VectorVariant &V, + ReturnInst *Return); + + /// \brief Inserts the if/else split and mask condition for masked SIMD + /// functions. + void insertSplitForMaskedVariant(Function *Clone, BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi); + + /// \brief Utility function to insert instructions with other instructions + /// of the same kind. + void insertInstruction(Instruction *Inst, BasicBlock *BB); + + /// \brief Utility function that generates instructions that calculate the + /// stride for a linear parameter. + Instruction* generateStrideForParameter(Function *Clone, Argument *Arg, + Instruction *ParmUser, int Stride, + PHINode *Phi); + + /// \brief Utility function that returns true if Inst is a store of a vector + /// or linear parameter. + bool isVectorOrLinearParamStore(Function *Clone, + std::vector &ParmKinds, + Instruction *Inst); + + /// \brief Removes the original scalar alloca instructions that correspond + /// to a vector parameter before widening. + void removeScalarAllocasForVectorParams( + std::vector &VectorParmMap); + + /// \brief Adds metadata to the conditional branch of the simd loop latch to + /// prevent loop unrolling. + void disableLoopUnrolling(BasicBlock *Latch); + + /// \brief Check to see that the type of the gep used for a load instruction + /// is compatible with the type needed as the result of the load. Basically, + /// check the validity of the LLVM IR to make sure that proper pointer + /// dereferencing is done. 
+ bool typesAreCompatibleForLoad(Type *GepType, Type *LoadType); + + bool runOnModule(Module &M) override; + + public: + + static char ID; + VecClone(); + void print(raw_ostream &OS, const Module * = nullptr) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +}; // end pass class + +ModulePass *createVecClonePass(); + +} // end llvm namespace + +#endif // LLVM_TRANSFORMS_VPO_VECCLONE_H Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -84,6 +84,7 @@ ValueLatticeUtils.cpp ValueTracking.cpp VectorUtils.cpp + VectorVariant.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -584,6 +584,26 @@ return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +unsigned TargetTransformInfo::maximumSizeofISAClassVectorRegister( + ISAClass I, Type *Ty) const { + + return TTIImpl->maximumSizeofISAClassVectorRegister(I, Ty); +} + +char TargetTransformInfo::encodeISAClass(ISAClass IsaClass) const { + return TTIImpl->encodeISAClass(IsaClass); +} + +TargetTransformInfo::ISAClass TargetTransformInfo::decodeISAClass( + char IsaClass) const { + return TTIImpl->decodeISAClass(IsaClass); +} + +Type* TargetTransformInfo::promoteToSupportedType(Type *Ty, + ISAClass IsaClass) const { + return TTIImpl->promoteToSupportedType(Ty, IsaClass); +} + bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); Index: lib/Analysis/VectorVariant.cpp =================================================================== --- lib/Analysis/VectorVariant.cpp +++ lib/Analysis/VectorVariant.cpp @@ -0,0 +1,112 @@ +//===---------- VectorVariant.cpp - Vector function ABI -*- C++ -*---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VectorVariant class and corresponding utilities. +/// VectorVariant objects are associated with a scalar function and are used +/// to generate new functions that can be vectorized. VectorVariants are +/// determined by inspecting the function attributes associated with the scalar +/// function. When a mangled function name is found in the attributes (indicated +/// as "_ZGV"), a VectorVariant object is created. The class and utilities +/// in this file follow the standards defined in the vector function ABI. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/Type.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +/// \brief Generate a vector variant by decoding the mangled string for the +/// variant contained in the original scalar function's attributes. For +/// example: "_ZGVxN4". The name mangling is defined in the vector function +/// ABI. Based on this string, the parameter kinds (uniform, linear, vector), +/// vector length, parameter alignment, and masking are determined. 
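To make the decoding performed by the constructor below concrete, consider a hypothetical masked AVX-512 variant name (the ISA letters come from the X86 encodeISAClass/decodeISAClass hooks added in this patch; other targets may use different letters):

    _ZGVeM16ul4a32v_foo
      e     -> ISA class ZMM
      M     -> masked variant
      16    -> vector length
      u     -> parameter 0: uniform
      l4a32 -> parameter 1: linear with constant stride 4, alignment 32
      v     -> parameter 2: vector
      (an extra vector mask parameter is appended to Parameters because the
       variant is masked; a negative stride is written with 'n', e.g. "ln2"
       for stride -2)

The vector length in the name is consistent with calcVlen below: for a 32-bit characteristic type on ZMM, 512 / 32 = 16.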
+VectorVariant::VectorVariant(StringRef FuncName, + TargetTransformInfo *TTI) : TTI(TTI) { + + assert(isVectorVariant(FuncName) && "invalid vector variant format"); + + std::stringstream SST(FuncName.drop_front(prefix().size())); + + // mandatory annotations + char EncodedISA; + SST.get(EncodedISA); + Isa = TTI->decodeISAClass(EncodedISA); + + char EncodedMask; + SST.get(EncodedMask); + Mask = decodeMask(EncodedMask); + SST >> Vlen; + + // optional parameter annotations + while (SST.peek() != '_') { + + char Kind; + int Stride = VectorKind::notAValue(); + int StrideSign = POSITIVE; + int Alignment = NOT_ALIGNED; + + // Get parameter kind + SST.get(Kind); + + // Default stride for linear is 1. If the stride for a parameter is 1, + // then the front-end will not encode it and we will not have set the + // correct stride below. + if (Kind == LINEAR_KIND) + Stride = 1; + + // Handle optional stride + if (SST.peek() == 'n') { + // Stride is negative + SST.ignore(1); + StrideSign = NEGATIVE; + } + + if (std::isdigit(SST.peek())) { + // Extract constant stride + SST >> Stride; + assert((Kind != STRIDE_KIND || Stride >= 0) && + "variable stride argument index cannot be negative"); + } + + Stride *= StrideSign; + // Handle optional alignment + if (SST.peek() == 'a') { + SST.ignore(1); + SST >> Alignment; + } + + VectorKind VecKind(Kind, Stride, Alignment); + Parameters.push_back(VecKind); + } + + if (Mask) { + // Masked variants will have an additional mask parameter + VectorKind VecKind(VECTOR_KIND, VectorKind::notAValue()); + Parameters.push_back(VecKind); + } +} + +/// \brief Determine the vector variant's vector length based on the +/// characteristic data type defined in the vector function ABI and target +/// vector register width. +unsigned int VectorVariant::calcVlen(TargetTransformInfo::ISAClass I, + Type* CharacteristicDataType) { + assert(CharacteristicDataType && + CharacteristicDataType->getPrimitiveSizeInBits() != 0 && + "expected characteristic data type to have a primitive size in bits"); + + unsigned int VectorRegisterSize = + TTI->maximumSizeofISAClassVectorRegister(I, CharacteristicDataType); + + return VectorRegisterSize / CharacteristicDataType->getPrimitiveSizeInBits(); +} Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -129,6 +129,12 @@ const Function *Callee) const; bool enableMemCmpExpansion(unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); + + unsigned maximumSizeofISAClassVectorRegister(TTI::ISAClass IsaClass, + Type *Ty) const; + char encodeISAClass(TTI::ISAClass IsaClass) const; + TargetTransformInfo::ISAClass decodeISAClass(char IsaClass) const; + Type* promoteToSupportedType(Type *Ty, TTI::ISAClass IsaClass) const; private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2808,3 +2808,83 @@ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } + +unsigned X86TTIImpl::maximumSizeofISAClassVectorRegister(TTI::ISAClass I, + Type *Ty) const { + + assert((Ty->isIntegerTy() || Ty->isFloatTy() || Ty->isDoubleTy() || + Ty->isPointerTy()) && + "unsupported type"); + + unsigned int 
VectorRegisterSize = 0; + + switch (I) { + case TTI::XMM: + VectorRegisterSize = 128; + break; + case TTI::YMM1: + if (Ty->isIntegerTy() || Ty->isPointerTy()) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case TTI::YMM2: + if (Ty->isIntegerTy(8)) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case TTI::ZMM: + VectorRegisterSize = 512; + break; + default: + llvm_unreachable("unknown isa class"); + return 0; + } + + assert(VectorRegisterSize != 0 && "unsupported ISA/type combination"); + return VectorRegisterSize; +} + +char X86TTIImpl::encodeISAClass(TTI::ISAClass IsaClass) const { + switch (IsaClass) { + case TTI::XMM: + return 'b'; + case TTI::YMM1: + return 'c'; + case TTI::YMM2: + return 'd'; + case TTI::ZMM: + return 'e'; + default: + break; + } + + assert(false && "unsupported ISA class"); + return '?'; +} + +TargetTransformInfo::ISAClass X86TTIImpl::decodeISAClass(char IsaClass) const { + switch (IsaClass) { + case 'b': + return TTI::XMM; + case 'c': + return TTI::YMM1; + case 'd': + return TTI::YMM2; + case 'e': + return TTI::ZMM; + default: + llvm_unreachable("unsupported ISA class"); + return TTI::XMM; + } +} + +Type* X86TTIImpl::promoteToSupportedType(Type *Ty, TTI::ISAClass I) const { + // On ZMM promote char and short to int + if (I == TargetTransformInfo::ISAClass::ZMM && (Ty->isIntegerTy(8) || + Ty->isIntegerTy(16))) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Utils/VecClone.h" using namespace llvm; @@ -94,6 +95,9 @@ "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); +static cl::opt RunVecClone("enable-vec-clone", cl::init(false), cl::Hidden, + cl::desc("Run Vector Function Cloning")); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -426,6 +430,10 @@ // new unnamed globals. if (PrepareForThinLTO) MPM.add(createNameAnonGlobalPass()); + + if (RunVecClone) + MPM.add(createVecClonePass()); + return; } @@ -588,6 +596,9 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass()); + if (RunVecClone) + MPM.add(createVecClonePass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -52,6 +52,7 @@ UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + VecClone.cpp VNCoercion.cpp ADDITIONAL_HEADER_DIRS Index: lib/Transforms/Utils/VecClone.cpp =================================================================== --- lib/Transforms/Utils/VecClone.cpp +++ lib/Transforms/Utils/VecClone.cpp @@ -0,0 +1,1727 @@ +//=------- VecClone.cpp - Vector function to loop transform -*- C++ -*-------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This pass inserts the body of a vector function inside a vector length +/// trip count scalar loop for functions that are declared SIMD. The pass +/// currently follows the gcc vector ABI requirements for name mangling +/// encodings, but will be extended in the future to also support the Intel +/// vector ABI. References to both ABIs can be found here: +/// +/// https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt +/// https://software.intel.com/sites/default/files/managed/b4/c8/Intel-Vector-Function-ABI.pdf +/// +/// Conceptually, this pass performs the following transformation: +/// +/// Before Translation: +/// +/// main.cpp +/// +/// #pragma omp declare simd uniform(a) linear(k) +/// extern float dowork(float *a, float b, int k); +/// +/// float a[4096]; +/// float b[4096]; +/// int main() { +/// int k; +/// for (k = 0; k < 4096; k++) { +/// b[k] = k; +/// } +/// #pragma clang loop vectorize(enable) +/// for (k = 0; k < 4096; k++) { +/// a[k] = k * 0.5; +/// a[k] = dowork(a, b[k], k); +/// } +/// } +/// +/// dowork.cpp +/// +/// #pragma omp declare simd uniform(a) linear(k) #0 +/// float dowork(float *a, float b, int k) { +/// return sinf(a[k]) + b; +/// } +/// +/// attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4uvl_", +/// "ZGVbN4uvl_", ... } +/// +/// After Translation: +/// +/// dowork.cpp +/// +/// // Non-masked variant +/// +/// "_ZGVbN4uvl_dowork(float *a, b, int k) { +/// alloc vec_ret; +/// alloc vec_b; +/// // casts from vector to scalar pointer allows loop to be in a scalar form +/// // that can be vectorized easily. +/// ret_cast = bitcast * vec_ret to float*; +/// vec_b_cast = bitcast * vec_b to float*; +/// store b, * vec_b; +/// for (int i = 0; i < VL; i++, k++) { +/// ret_cast[i] = sinf(a[k]) + vec_b_cast[i]; +/// } +/// return vec_ret; +/// } +/// +/// // Masked variant +/// +/// "_ZGVbM4uvl_dowork(float *a, b, int k, +/// mask) { +/// alloc vec_ret; +/// alloc vec_b; +/// ret_cast = bitcast * vec_ret to float*; +/// vec_b_cast = bitcast * vec_b to float*; +/// store b, * vec_b; +/// for (int i = 0; i < VL; i++, k++) { +/// if (mask[i] != 0) +/// ret_cast[i] = sinf(a[k]) + vec_b_cast[i]; +/// } +/// return vec_ret; +/// } +/// +// ===--------------------------------------------------------------------=== // + +// This pass is flexible enough to recognize whether or not parameters have been +// registerized so that the users of the parameter can be properly updated. For +// instance, we need to know where the users of linear parameters are so that +// the stride can be added to them. +// +// In the following example, %i and %x are used directly by %add directly, so +// in this case the pass can just look for users of %i and %x. +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %add = add nsw i32 %x, %i +// ret i32 %add +// } +// +// When parameters have not been registerized, parameters are used indirectly +// through a store/load of the parameter to/from memory that has been allocated +// for them in the function. Thus, in this case, the pass looks for users of +// %0 and %1. 
+// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %i.addr = alloca i32, align 4 +// %x.addr = alloca i32, align 4 +// store i32 %i, i32* %i.addr, align 4 +// store i32 %x, i32* %x.addr, align 4 +// %0 = load i32, i32* %x.addr, align 4 +// %1 = load i32, i32* %i.addr, align 4 +// %add = add nsw i32 %0, %1 +// ret i32 %add +// } +// +// The pass must run at all optimization levels because it is possible that +// a loop calling the vector function is vectorized, but the vector function +// itself is not vectorized. For example, above main.cpp may be compiled at +// -O2, but dowork.cpp may be compiled at -O0. Therefore, it is required that +// the attribute list for the vector function specify all variants that must +// be generated by this pass so as to avoid any linking problems. This pass +// also serves to canonicalize the input IR to the loop vectorizer. + +#include "llvm/Transforms/Utils/VecClone.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include + +#define SV_NAME "vec-clone" +#define DEBUG_TYPE "VecClone" + +using namespace llvm; + +VecClone::VecClone() : ModulePass(ID) {} + +void VecClone::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + +Type *VecClone::calcCharacteristicType(Function &F, VectorVariant &Variant) { + Type *ReturnType = F.getReturnType(); + Type *CharacteristicDataType = nullptr; + + if (!ReturnType->isVoidTy()) + CharacteristicDataType = ReturnType; + + if (!CharacteristicDataType) { + + std::vector &ParmKinds = Variant.getParameters(); + Function::const_arg_iterator ArgIt = F.arg_begin(); + Function::const_arg_iterator ArgEnd = F.arg_end(); + std::vector::iterator VKIt = ParmKinds.begin(); + + for (; ArgIt != ArgEnd; ++ArgIt, ++VKIt) { + if (VKIt->isVector()) { + CharacteristicDataType = (*ArgIt).getType(); + break; + } + } + } + + // TODO except Clang's ComplexType + if (!CharacteristicDataType || CharacteristicDataType->isStructTy()) { + CharacteristicDataType = Type::getInt32Ty(F.getContext()); + } + + // Legalize the characteristic type based on target requirements. + CharacteristicDataType = + Variant.promoteToSupportedType(CharacteristicDataType); + + if (CharacteristicDataType->isPointerTy()) { + // For such cases as 'int* foo(int x)', where x is a non-vector type, the + // characteristic type at this point will be i32*. If we use the DataLayout + // to query the supported pointer size, then a promotion to i64* is + // incorrect because the mask element type will mismatch the element type + // of the characteristic type. + PointerType *PointerTy = cast(CharacteristicDataType); + CharacteristicDataType = PointerTy->getElementType(); + } + + return CharacteristicDataType; +} + +void VecClone::getFunctionsToVectorize( + llvm::Module &M, std::map> &FuncVars) { + + // FuncVars will contain a 1-many mapping between the original scalar + // function and the vector variant encoding strings (represented as + // attributes). The encodings correspond to functions that will be created by + // the caller of this function as vector versions of the original function. 
+ // For example, if foo() is a function marked as a simd function, it will have + // several vector variant encodings like: "_ZGVbM4_foo", "_ZGVbN4_foo", + // "_ZGVcM8_foo", "_ZGVcN8_foo", "_ZGVdM8_foo", "_ZGVdN8_foo", "_ZGVeM16_foo", + // "_ZGVeN16_foo". The caller of this function will then clone foo() and name + // the clones using the above name manglings. The variant encodings correspond + // to differences in masked/non-masked execution, vector length, and target + // vector register size, etc. For more details, please refer to the following + // reference for details on the vector function encodings. + // https://www.cilkplus.org/sites/default/files/open_specifications/ + // Intel-ABI-Vector-Function-2012-v0.9.5.pdf + + for (auto It = M.begin(), End = M.end(); It != End; ++It) { + Function &F = *It; + if (F.hasFnAttribute("vector-variants")) { + Attribute Attr = F.getFnAttribute("vector-variants"); + StringRef VariantsStr = Attr.getValueAsString(); + SmallVector Variants; + VariantsStr.split(Variants, ','); + for (unsigned i = 0; i < Variants.size(); i++) { + FuncVars[&F].push_back(Variants[i]); + } + } + } +} + +template Constant * +VecClone::getConstantValue(Type *Ty, LLVMContext &Context, int Val); +template Constant * +VecClone::getConstantValue(Type *Ty, LLVMContext &Context, float Val); +template Constant * +VecClone::getConstantValue(Type *Ty, LLVMContext &Context, double Val); +template +Constant *VecClone::getConstantValue(Type *Ty, LLVMContext &Context, T Val) { + Constant *ConstVal = nullptr; + + if (Ty->isIntegerTy()) { + ConstVal = ConstantInt::get(Ty, Val); + } else if (Ty->isFloatTy()) { + ConstVal = ConstantFP::get(Ty, Val); + } + + assert(ConstVal && "Could not generate constant for type"); + + return ConstVal; +} + +void VecClone::insertInstruction(Instruction *Inst, BasicBlock *BB) { + // This function inserts instructions in a way that groups like instructions + // together for debuggability/readability purposes. This was designed to make + // the entry basic block easier to read since this pass creates/modifies + // alloca, store, and bitcast instructions for each vector parameter and + // return. Thus, this function ensures all allocas are grouped together, all + // stores are grouped together, and so on. If the type of instruction passed + // in does not exist in the basic block, then it is added to the end of the + // basic block, just before the terminator instruction. + + BasicBlock::reverse_iterator BBIt = BB->rbegin(); + BasicBlock::reverse_iterator BBEnd = BB->rend(); + BasicBlock::iterator AnchorInstIt = BB->end(); + AnchorInstIt--; + Instruction *Anchor = &*AnchorInstIt; + + for (; BBIt != BBEnd; ++BBIt) { + if (Inst->getOpcode() == (&*BBIt)->getOpcode()) { + Anchor = &*BBIt; + break; + } + } + + if (isa(Anchor)) { + Inst->insertBefore(Anchor); + } else { + Inst->insertAfter(Anchor); + } +} + +bool VecClone::hasComplexType(Function *F) { + Function::arg_iterator ArgListIt = F->arg_begin(); + Function::arg_iterator ArgListEnd = F->arg_end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + // Complex types for parameters/return come in as vector. 
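For the attribute handling in getFunctionsToVectorize above, a hypothetical IR-level example (names illustrative; the attribute value is a single comma-separated string that the pass splits on ','):

    define float @dowork(float* %a, float %b, i32 %k) #0 { ... }
    attributes #0 = { nounwind "vector-variants"="_ZGVbM4uvl_,_ZGVbN4uvl_" }

    ; after the split, FuncVars maps @dowork to
    ;   { "_ZGVbM4uvl_", "_ZGVbN4uvl_" }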
+ if (ArgListIt->getType()->isVectorTy()) { + return true; + } + } + + return false; +} + +Function *VecClone::CloneFunction(Function &F, VectorVariant &V) { + + DEBUG(dbgs() << "Cloning Function: " << F.getName() << "\n"); + DEBUG(F.dump()); + + FunctionType *OrigFunctionType = F.getFunctionType(); + Type *ReturnType = F.getReturnType(); + Type *CharacteristicType = calcCharacteristicType(F, V); + + // Expand return type to vector. + if (!ReturnType->isVoidTy()) + ReturnType = VectorType::get(ReturnType, V.getVlen()); + + std::vector ParmKinds = V.getParameters(); + SmallVector ParmTypes; + FunctionType::param_iterator ParmIt = OrigFunctionType->param_begin(); + FunctionType::param_iterator ParmEnd = OrigFunctionType->param_end(); + std::vector::iterator VKIt = ParmKinds.begin(); + for (; ParmIt != ParmEnd; ++ParmIt, ++VKIt) { + if (VKIt->isVector()) + ParmTypes.push_back( + VectorType::get((*ParmIt)->getScalarType(), V.getVlen())); + else + ParmTypes.push_back(*ParmIt); + } + + if (V.isMasked()) { + Type *MaskVecTy = VectorType::get(CharacteristicType, V.getVlen()); + ParmTypes.push_back(MaskVecTy); + } + + FunctionType *CloneFuncType = FunctionType::get(ReturnType, ParmTypes, false); + + std::string VariantName = V.generateFunctionName(F.getName()); + Function *Clone = Function::Create( + CloneFuncType, GlobalValue::ExternalLinkage, VariantName, F.getParent()); + + // Remove vector variant attributes from the original function. They are + // not needed for the cloned function and it prevents any attempts at + // trying to clone the function again in case the pass is called more than + // once. + F.removeFnAttr("vector-variants"); + + Function::arg_iterator ArgIt = F.arg_begin(); + Function::arg_iterator ArgEnd = F.arg_end(); + ValueToValueMapTy Vmap; + Function::arg_iterator NewArgIt = Clone->arg_begin(); + for (; ArgIt != ArgEnd; ++ArgIt, ++NewArgIt) { + NewArgIt->setName(ArgIt->getName()); + Vmap[&*ArgIt] = &*NewArgIt; + } + + if (V.isMasked()) { + Argument &MaskArg = *NewArgIt; + MaskArg.setName("mask"); + } + + SmallVector Returns; + CloneFunctionInto(Clone, &F, Vmap, true, Returns); + + // Remove incompatible argument attributes (applied to the scalar argument, + // does not apply to its vector counterpart). This must be done after cloning + // the function because CloneFunctionInto() transfers parameter attributes + // from the original parameters in the Vmap. 
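For the dowork example in the file header of VecClone.cpp, the signatures produced by CloneFunction look roughly as follows (illustrative; vector length 4, float characteristic type):

    ; scalar original
    define float @dowork(float* %a, float %b, i32 %k)

    ; non-masked "_ZGVbN4uvl_": uniform %a and linear %k stay scalar,
    ; the vector parameter %b and the return are widened
    define <4 x float> @_ZGVbN4uvl_dowork(float* %a, <4 x float> %b, i32 %k)

    ; masked "_ZGVbM4uvl_": an extra mask parameter of the characteristic
    ; type is appended
    define <4 x float> @_ZGVbM4uvl_dowork(float* %a, <4 x float> %b, i32 %k,
                                          <4 x float> %mask)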
+ ArgIt = Clone->arg_begin(); + ArgEnd = Clone->arg_end(); + AttrBuilder AB; + for (uint64_t Idx = 0; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + Type *ArgType = (*ArgIt).getType(); + AB = AttributeFuncs::typeIncompatible(ArgType); + Clone->removeParamAttrs(Idx, AB); + } + + AB = AttributeFuncs::typeIncompatible(ReturnType); + Clone->removeAttributes(AttributeList::ReturnIndex, AB); + + DEBUG(dbgs() << "After Cloning and Function Signature expansion\n"); + DEBUG(Clone->dump()); + + return Clone; +} + +bool VecClone::isVectorOrLinearParamStore(Function *Clone, + std::vector &ParmKinds, + Instruction *Inst) { + if (StoreInst *Store = dyn_cast(Inst)) { + Value *Op0 = Store->getOperand(0); + Function::arg_iterator ArgListIt = Clone->arg_begin(); + Function::arg_iterator ArgListEnd = Clone->arg_end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + unsigned ParmIdx = ArgListIt->getArgNo(); + if (&*ArgListIt == Op0 && + (ParmKinds[ParmIdx].isVector() || ParmKinds[ParmIdx].isLinear())) { + return true; + } + } + } + + return false; +} + +BasicBlock *VecClone::splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock) { + + // EntryInsts contains all instructions that need to stay in the entry basic + // block. These instructions include allocas and stores involving vector and + // linear parameters to alloca. Linear parameter stores to alloca are kept in + // the entry block because there will be a load from this alloca in the loop + // for which we will apply the stride. Instructions involving uniform + // parameter stores to alloca should be sunk into the loop to maintain + // uniform behavior. All instructions involving private variables are also + // sunk into the loop. + + SmallVector EntryInsts; + std::vector ParmKinds = V.getParameters(); + BasicBlock::iterator BBIt = EntryBlock->begin(); + BasicBlock::iterator BBEnd = EntryBlock->end(); + + for (; BBIt != BBEnd; ++BBIt) { + if (isa(BBIt) || + isVectorOrLinearParamStore(Clone, ParmKinds, &*BBIt)) { + // If this is a store of a vector parameter, keep it in the entry block + // because it will be modified with the vector alloca reference. Since the + // parameter has already been expanded, this becomes a vector store (i.e., + // packing instruction) that we do not want to appear in the scalar loop. + // It is correct to leave linear parameter stores in the entry or move + // them to the scalar loop, but leaving them in the entry block prevents + // an additional store inside the loop. Uniform parameter stores must be + // moved to the loop body to behave as uniform. Consider the following: + // + // __declspec(vector(uniform(x))) + // int foo(int a, int x) { + // x++; + // return (a + x); + // } + // + // Assume x = 1 for the call to foo. This implies x = 2 for the vector + // add. e.g., a[0:VL-1] + <2, 2, 2, 2>. If the initial store of x to the + // stack is done in the entry block outside of the loop, then x will be + // incremented by one each time within the loop because the increment of + // x will reside in the loop. Therefore, if the store of x is sunk into + // the loop, the initial value of 1 will always be stored to a temp + // before the increment, resulting in the value of 2 always being computed + // in the scalar loop. 
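A rough sketch of what splitEntryIntoLoop leaves behind for the foo(a, x) example above, assuming unoptimized IR in which parameters still have allocas (all names illustrative):

    entry:                              ; allocas and vector/linear parameter
      %a.addr = alloca i32              ; stores remain in the entry block
      %x.addr = alloca i32
      store i32 %a, i32* %a.addr
      br label %simd.loop

    simd.loop:                          ; the uniform store is sunk, so x++
      store i32 %x, i32* %x.addr        ; always starts from the incoming
      %0 = load i32, i32* %x.addr       ; value of %x on every iteration
      %inc = add nsw i32 %0, 1
      store i32 %inc, i32* %x.addr
      ...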
+ EntryInsts.push_back(&*BBIt); + } + } + + BasicBlock *LoopBlock = + EntryBlock->splitBasicBlock(EntryBlock->begin(), "simd.loop"); + + for (auto *Inst : EntryInsts) { + Inst->removeFromParent(); + Inst->insertBefore(EntryBlock->getTerminator()); + } + + DEBUG(dbgs() << "After Entry Block Split\n"); + DEBUG(Clone->dump()); + + return LoopBlock; +} + +BasicBlock *VecClone::splitLoopIntoReturn(Function *Clone, + BasicBlock *LoopBlock) { + + // Determine the basic block with the return. For simple cases, the 'ret' + // instruction will be part of the entry block. In this case, separate the + // 'ret' into a new basic block because we don't want this as part of the + // loop. For more complex cases, the 'ret' and corresponding instructions + // (i.e., load from auto variable) will already be in a separate basic block, + // so no need to split here. + + Instruction *SplitPt = LoopBlock->getTerminator(); + + if (ReturnInst *Return = dyn_cast(SplitPt)) { + + // If the return is from a preceeding load, make sure the load is also put + // in the return block. This is the old scalar load that will end up getting + // replaced with the vector return and will get cleaned up later. + + // Make sure this is not a void function before getting the return + // operand. + if (!Clone->getReturnType()->isVoidTy()) { + Value *RetOp = Return->getOperand(0); + Value::use_iterator UseIt = RetOp->use_begin(); + Value::use_iterator UseEnd = RetOp->use_end(); + + for (; UseIt != UseEnd; ++UseIt) { + LoadInst *RetLoad = dyn_cast(*UseIt); + if (RetLoad) { + SplitPt = RetLoad; + } + } + } + } + + Function::iterator ReturnBlockIt = Clone->end(); + BasicBlock *ReturnBlock; + if (dyn_cast(SplitPt) || dyn_cast(SplitPt)) { + ReturnBlock = LoopBlock->splitBasicBlock(SplitPt, "return"); + } else { + ReturnBlockIt = Clone->end(); + ReturnBlockIt--; + ReturnBlock = &*ReturnBlockIt; + } + + return ReturnBlock; +} + +void VecClone::updateReturnPredecessors(Function *Clone, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock) { + // Update the branches of the ReturnBlock predecessors to point back to + // LoopBlock if the index is less than VL. + + // First, collect the basic blocks to be updated since we don't want to update + // the CFG while iterating through it. + SmallVector BranchesToUpdate; + Function::iterator FI = Clone->begin(); + Function::iterator FE = Clone->end(); + for (; FI != FE; ++FI) { + + BasicBlock::iterator BBI = FI->begin(); + BasicBlock::iterator BBE = FI->end(); + + for (; BBI != BBE; ++BBI) { + + BranchInst *Branch = dyn_cast(BBI); + + if (Branch) { + unsigned NumSucc = Branch->getNumSuccessors(); + + for (unsigned I = 0; I < NumSucc; ++I) { + if (Branch->getSuccessor(I) == ReturnBlock) { + BranchesToUpdate.push_back(Branch); + } + } + } + } + } + + // Now, do the actual update. The code below handles both conditional and + // unconditional branches because we loop through all successors of the + // branch to see if any of them point to the ReturnBlock. 
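Taken together with splitEntryIntoLoop and splitLoopIntoReturn, the createLoopExit and createPhiAndBackedgeForLoop functions below give the clone a control-flow skeleton of roughly this shape (vector length 4 shown; the block and value names match the ones created by this pass):

    simd.loop:
      %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ]
      ...                               ; cloned scalar body
      br label %simd.loop.exit

    simd.loop.exit:
      %indvar = add nuw i32 %index, 1
      %vl.cond = icmp ult i32 %indvar, 4
      br i1 %vl.cond, label %simd.loop, label %return

    return:
      ...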
+ for (unsigned I = 0; I < BranchesToUpdate.size(); ++I) { + unsigned int NumOps = BranchesToUpdate[I]->getNumSuccessors(); + for (unsigned Idx = 0; Idx < NumOps; ++Idx) { + BasicBlock *Successor = BranchesToUpdate[I]->getSuccessor(Idx); + if (Successor == ReturnBlock) { + BranchesToUpdate[I]->setOperand(Idx, LoopExitBlock); + } + } + } +} + +BasicBlock *VecClone::createLoopExit(Function *Clone, BasicBlock *ReturnBlock) { + BasicBlock *LoopExitBlock = BasicBlock::Create( + Clone->getContext(), "simd.loop.exit", Clone, ReturnBlock); + + updateReturnPredecessors(Clone, LoopExitBlock, ReturnBlock); + return LoopExitBlock; +} + +PHINode *VecClone::createPhiAndBackedgeForLoop( + Function *Clone, BasicBlock *EntryBlock, BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, BasicBlock *ReturnBlock, int VectorLength) { + + // Create the phi node for the top of the loop block and add the back + // edge to the loop from the loop exit. + + PHINode *Phi = PHINode::Create(Type::getInt32Ty(Clone->getContext()), 2, + "index", &*LoopBlock->getFirstInsertionPt()); + + Constant *Inc = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 1); + Constant *IndInit = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 0); + + Instruction *Induction = + BinaryOperator::CreateNUWAdd(Phi, Inc, "indvar", LoopExitBlock); + + Constant *VL = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), VectorLength); + + Instruction *VLCmp = + new ICmpInst(*LoopExitBlock, CmpInst::ICMP_ULT, Induction, VL, "vl.cond"); + + BranchInst::Create(LoopBlock, ReturnBlock, VLCmp, LoopExitBlock); + + Phi->addIncoming(IndInit, EntryBlock); + Phi->addIncoming(Induction, LoopExitBlock); + + DEBUG(dbgs() << "After Loop Insertion\n"); + DEBUG(Clone->dump()); + + return Phi; +} + +Instruction * +VecClone::expandVectorParameters(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock, + std::vector &VectorParmMap) { + // For vector parameters, expand the existing alloca (if there is one) to a + // vector. If there isn't one, create a VL-sized alloca for it. Then, bitcast + // the vector and store this instruction in a map. The map is later used to + // insert the new instructions and to replace the old scalar memory + // references. If there are no parameters, then the function simply does not + // perform any expansion since we iterate over the function's arg list. We + // must always have a vector alloca for vector parameters so that we can + // bitcast them to a scalar pointer that can be loaded from using the loop + // index. + + Instruction *Mask = nullptr; + SmallVector StoresToInsert; + + Function::arg_iterator ArgIt = Clone->arg_begin(); + Function::arg_iterator ArgEnd = Clone->arg_end(); + + unsigned LastArg = Clone->arg_size() - 1; + unsigned ArgIdx = 0; + + for (; ArgIt != ArgEnd; ++ArgIt) { + + User::user_iterator UserIt = ArgIt->user_begin(); + User::user_iterator UserEnd = ArgIt->user_end(); + + VectorType *VecType = dyn_cast(ArgIt->getType()); + + if (VecType) { + + // Some args other than the mask may not have users, but have not been + // removed as dead. In those cases, just go on to the next argument. + // There's no need to expand non-mask parameters with no users. + bool MaskArg = V.isMasked() && ArgIdx == LastArg; + + if (!(!MaskArg && ArgIt->getNumUses() == 0)) { + + // Create a new vector alloca and bitcast to a pointer to the element + // type. 
The following is an example of what the cast should look like: + // + // %veccast = bitcast <2 x i32>* %vec_a.addr to i32* + // + // geps using the bitcast will appear in a scalar form instead of + // casting to an array or using vector. For example, + // + // %vecgep1 = getelementptr i32, i32* %veccast, i32 %index + // + // instead of: + // + // getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 1 + // + // We do this to put the geps in a more scalar form. + + const DataLayout &DL = Clone->getParent()->getDataLayout(); + AllocaInst *VecAlloca = new AllocaInst(VecType, DL.getAllocaAddrSpace(), + "vec." + ArgIt->getName()); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = PointerType::get( + VecType->getElementType(), VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecParmCast = nullptr; + if (MaskArg) { + Mask = new BitCastInst(VecAlloca, ElemTypePtr, "mask.cast"); + } else { + VecParmCast = new BitCastInst(VecAlloca, ElemTypePtr, + "vec." + ArgIt->getName() + ".cast"); + insertInstruction(VecParmCast, EntryBlock); + } + + StoreInst *StoreUser = nullptr; + AllocaInst *Alloca = nullptr; + ParmRef *PRef = new ParmRef(); + + for (; UserIt != UserEnd; ++UserIt) { + + StoreUser = dyn_cast(*UserIt); + + if (StoreUser) { + // For non-mask parameters, find the initial store of the parameter + // to an alloca instruction. Map this alloca to the vector bitcast + // created above so that we can update the old scalar references. + Alloca = dyn_cast(UserIt->getOperand(1)); + PRef->VectorParm = Alloca; + break; + } + } + + if (!Alloca && !Mask) { + // Since Mem2Reg has run, there is no existing scalar store for + // the parameter, but we must still pack (store) the expanded vector + // parameter to a new vector alloca. This store is created here and + // put in a container for later insertion. We cannot insert it here + // since this will be a new user of the parameter and we are still + // iterating over the original users of the parameter. This will + // invalidate the iterator. We also map the parameter directly to the + // vector bitcast so that we can later update any users of the + // parameter. + Value *ArgValue = dyn_cast(ArgIt); + StoreInst *Store = new StoreInst(ArgValue, VecAlloca); + StoresToInsert.push_back(Store); + PRef->VectorParm = ArgValue; + } + + if (!Mask) { + // Mapping not needed for the mask parameter because there will + // be no users of it to replace. This parameter will only be used to + // introduce if conditions on each mask bit. + PRef->VectorParmCast = VecParmCast; + VectorParmMap.push_back(PRef); + } + } + } + + ArgIdx++; + } + + // Insert any necessary vector parameter stores here. This is needed for when + // there were no existing scalar stores that we can update to vector stores + // for the parameter. This is needed when Mem2Reg has registerized parameters. + // The stores are inserted after the allocas in the entry block. + for (auto *Inst : StoresToInsert) { + insertInstruction(Inst, EntryBlock); + } + + return Mask; +} + +Instruction *VecClone::createExpandedReturn(Function *Clone, + BasicBlock *EntryBlock, + VectorType *ReturnType) { + // Expand the return temp to a vector. 
+ + VectorType *AllocaType = dyn_cast(Clone->getReturnType()); + + const DataLayout &DL = Clone->getParent()->getDataLayout(); + AllocaInst *VecAlloca = + new AllocaInst(AllocaType, DL.getAllocaAddrSpace(), "vec.retval"); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = PointerType::get( + ReturnType->getElementType(), VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecCast = new BitCastInst(VecAlloca, ElemTypePtr, "ret.cast"); + insertInstruction(VecCast, EntryBlock); + + return VecCast; +} + +Instruction *VecClone::expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector &VectorParmMap) { + // Determine how the return is currently handled, since this will determine + // if a new vector alloca is required for it. For simple functions, an alloca + // may not have been created for the return value. The function may just + // simply return a value defined by some operation that now exists within the + // loop. If an alloca was generated already, then the return block will load + // from it and then return. Thus, we look for a return resulting from a load + // in the return block. If found, we have already expanded all alloca + // instructions to vector types and the old scalar references have already + // been replaced with them. In this case, we only need to pack the results + // from the vector alloca into a temp and return the temp. If a vector alloca + // was not generated for the return, we need to add one for it because we have + // a scalar reference in the loop that needs to be replaced. After creating + // the new vector alloca, replace the reference to it in the loop and then + // pack the results into a temp and return it. + // + // Example 1: // alloca not generated in entry block + // + // loop: + // ... // some set of instructions + // %add1 = add nsw i32 %1, %2 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // ret i32 %add1 + // + // + // Example 2: + // + // loop: + // ... // some set of instructions + // %vecgep1 = getelementptr <2 x i32>* %vec_ret, i32 0, i32 %index + // store i32 %add2, i32* %vecgep1 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // %7 = load i32, i32*, %retval // the original scalar alloca + // ret i32 %7 + // + + ReturnInst *FuncReturn = dyn_cast(ReturnBlock->getTerminator()); + assert(FuncReturn && "Expected ret instruction to terminate the return\ + basic block"); + + LoadInst *LoadFromAlloca = dyn_cast(FuncReturn->getOperand(0)); + + // We need to generate a vector alloca for the return vector. + // Two cases exist, here: + // + // 1) For simple functions, the return is a temp defined within the + // loop body and the temp is not loaded from an alloca, or the return is + // a constant. (obviously, also not loaded from an alloca) + // + // 2) The return temp traces back to an alloca. + // + // For both cases, generate a vector alloca so that we can later load from it + // and return the vector temp from the function. The alloca is used to load + // and store from so that the scalar loop contains load/store/gep + // instructions. This enables AVR construction to remain straightforward. + // E.g., we don't need to worry about figuring out how to represent + // insert/extract when building AVR nodes. This keeps consistent with how ICC + // is operating. 
+ // + // Additionally, for case 1 we must generate a gep and store after the + // instruction that defines the original return temp, so that we can store + // the result into the proper index of the return vector. For case 2, we must + // go into the loop and replace the old scalar alloca reference with the one + // just created as vector. + + Instruction *VecReturn = NULL; + VectorType *ReturnType = dyn_cast(Clone->getReturnType()); + + if (!LoadFromAlloca) { + + // Case 1 + + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + Value *RetVal = FuncReturn->getReturnValue(); + Instruction *RetFromTemp = dyn_cast(RetVal); + + Instruction *InsertPt; + Value *ValToStore; + Instruction *Phi = &*LoopBlock->begin(); + + if (RetFromTemp) { + // If we're returning from an SSA temp, set the insert point to the + // definition of the temp. + InsertPt = RetFromTemp; + ValToStore = RetFromTemp; + } else { + // If we're returning a constant, then set the insert point to the loop + // phi. From here, a store to the vector using the constant is inserted. + InsertPt = Phi; + ValToStore = RetVal; + } + + // Generate a gep from the bitcast of the vector alloca used for the return + // vector. + GetElementPtrInst *VecGep = + GetElementPtrInst::Create(ReturnType->getElementType(), VecReturn, Phi, + VecReturn->getName() + ".gep"); + VecGep->insertAfter(InsertPt); + + // Store the constant or temp to the appropriate lane in the return vector. + StoreInst *VecStore = new StoreInst(ValToStore, VecGep); + VecStore->insertAfter(VecGep); + + } else { + + // Case 2 + + AllocaInst *Alloca = dyn_cast(LoadFromAlloca->getOperand(0)); + bool AllocaFound = false; + unsigned ParmIdx = 0; + + for (; ParmIdx < VectorParmMap.size(); ParmIdx++) { + Value *ParmVal = VectorParmMap[ParmIdx]->VectorParm; + if (ParmVal == Alloca) + AllocaFound = true; + } + + if (AllocaFound) { + // There's already a vector alloca created for the return, which is the + // same one used for the parameter. E.g., we're returning the updated + // parameter. + VecReturn = VectorParmMap[ParmIdx]->VectorParmCast; + } else { + // A new return vector is needed because we do not load the return value + // from an alloca. + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + ParmRef *PRef = new ParmRef(); + PRef->VectorParm = Alloca; + PRef->VectorParmCast = VecReturn; + VectorParmMap.push_back(PRef); + } + } + + return VecReturn; +} + +Instruction *VecClone::expandVectorParametersAndReturn( + Function *Clone, VectorVariant &V, Instruction **Mask, + BasicBlock *EntryBlock, BasicBlock *LoopBlock, BasicBlock *ReturnBlock, + std::vector &VectorParmMap) { + // If there are no parameters, then this function will do nothing and this + // is the expected behavior. + *Mask = expandVectorParameters(Clone, V, EntryBlock, VectorParmMap); + + // If the function returns void, then don't attempt to expand to vector. + Instruction *ExpandedReturn = ReturnBlock->getTerminator(); + if (!Clone->getReturnType()->isVoidTy()) { + ExpandedReturn = + expandReturn(Clone, EntryBlock, LoopBlock, ReturnBlock, VectorParmMap); + } + + // So, essentially what has been done to this point is the creation and + // insertion of the vector alloca instructions. Now, we insert the bitcasts of + // those instructions, which have been stored in the map. The insertion of the + // vector bitcast to element type pointer is done at the end of the EntryBlock + // to ensure that any initial stores of vector parameters have been done + // before the cast. 
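  // For reference, the masked two_vec_sum variant below ends up with its
  // entry block ordered this way (align attributes omitted):
  //
  //   %vec.i = alloca <4 x i32>                        ; allocas first
  //   %vec.j = alloca <4 x i32>
  //   %vec.mask = alloca <4 x i32>
  //   %vec.retval = alloca <4 x i32>
  //   store <4 x i32> %i, <4 x i32>* %vec.i            ; then parameter stores
  //   store <4 x i32> %j, <4 x i32>* %vec.j
  //   store <4 x i32> %mask, <4 x i32>* %vec.mask
  //   %vec.i.cast = bitcast <4 x i32>* %vec.i to i32*  ; casts last
  //   %vec.j.cast = bitcast <4 x i32>* %vec.j to i32*
  //   %ret.cast = bitcast <4 x i32>* %vec.retval to i32*
  //   %mask.cast = bitcast <4 x i32>* %vec.mask to i32*
  //   br label %simd.loop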
+ + std::vector::iterator MapIt; + for (auto MapIt : VectorParmMap) { + Instruction *ExpandedCast = MapIt->VectorParmCast; + if (!ExpandedCast->getParent()) { + insertInstruction(ExpandedCast, EntryBlock); + } + } + + // Insert the mask parameter store to alloca and bitcast if this is a masked + // variant. + if (*Mask) { + // Mask points to the bitcast of the alloca instruction to element type + // pointer. Insert the bitcast after all of the other bitcasts for vector + // parameters. + insertInstruction(*Mask, EntryBlock); + + Value *MaskVector = (*Mask)->getOperand(0); + + // MaskParm points to the function's mask parameter. + Function::arg_iterator MaskParm = Clone->arg_end(); + MaskParm--; + + // Find the last parameter store in the function entry block and insert the + // the store of the mask parameter after it. We do this just to make the + // LLVM IR easier to read. If there are no parameters, just insert the store + // before the terminator. For safety, if we cannot find a store, then insert + // this store after the last alloca. At this point, there will at least be + // an alloca for either a parameter or return. This code just ensures that + // the EntryBlock instructions are grouped by alloca, followed by store, + // followed by bitcast for readability reasons. + + StoreInst *MaskStore = new StoreInst(&*MaskParm, MaskVector); + insertInstruction(MaskStore, EntryBlock); + } + + DEBUG(dbgs() << "After Parameter/Return Expansion\n"); + DEBUG(Clone->dump()); + + return ExpandedReturn; +} + +bool VecClone::typesAreCompatibleForLoad(Type *GepType, Type *LoadType) { + // GepType will always be a pointer since this refers to an alloca for a + // vector. + PointerType *GepPtrTy = dyn_cast(GepType); + Type *LoadFromTy = GepPtrTy->getElementType(); + Type *LoadToTy = LoadType; + + // Dereferencing pointers in LLVM IR means that we have to have a load for + // each level of indirection. This means that we load from a gep and the + // resulting load value type is reduced by one level of indirection. For + // example, we load from a gep of i32* to a temp that has an i32 type. We + // cannot do multiple levels of dereferencing in a single load. For example, + // we cannot load from a gep of i32** to an i32. This requires two loads. + // + // Legal Case: GepType = i32**, LoadFromTy = i32*, + // LoadType = i32*, LoadToTy = i32* + // + // %vec.b.elem.2 = load i32*, i32** %vec.b.cast.gep1 + // + // In this case, since both are pointers, types will be considered equal by + // LLVM, so we must continue getting the element types of each pointer type + // until one is no longer a pointer type. Then do an equality check. + // + // Legal Case: GepType = i32*, LoadFromTy = i32, + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32* %vec.b.cast.gep1 + // + // Ready to compare as is + // + // Illegal Case: GepType = i32**, LoadFromTy = i32* + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32** %vec.b.cast.gep1 + // + // This case arises due to differences in the LLVM IR at -O0 and >= -O1. + // For >= -O1, Mem2Reg registerizes parameters and there are no alloca + // instructions created for function parameters. At -O0, vector parameters + // are expanded and we modify the existing alloca that was used for the scalar + // parameter. When there is no alloca for vector parameters, we must create + // one for them. Thus, we have introduced an additional level of indirection + // for users of parameters at >= -O1. 
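  // An equivalent way to view the check below: a single load strips exactly
  // one level of indirection, so the load is legal here only when the gep's
  // pointee and the load's result type have the same pointer depth and bottom
  // out at the same kind of type.  A minimal sketch of that formulation (the
  // lambda is purely illustrative and not used by this patch):
  //
  //   auto PointerDepthOf = [](Type *Ty) {
  //     unsigned Depth = 0;
  //     while (auto *PT = dyn_cast<PointerType>(Ty)) {
  //       Ty = PT->getElementType();
  //       ++Depth;
  //     }
  //     return Depth;
  //   };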
This can become a problem for load + // instructions and results in this illegal case. This function helps to + // check that we are not attempting to do an extra level of indirection + // within the load instructions for elements of vector parameters in the + // simd loop. If an illegal case is encountered, an additional load is + // inserted to account for the extra level of indirection and any users are + // updated accordingly. + + while (LoadFromTy->getTypeID() == Type::PointerTyID && + LoadToTy->getTypeID() == Type::PointerTyID) { + + PointerType *FromPtrTy = cast(LoadFromTy); + PointerType *ToPtrTy = cast(LoadToTy); + + LoadFromTy = FromPtrTy->getElementType(); + LoadToTy = ToPtrTy->getElementType(); + } + + if (LoadFromTy->getTypeID() == LoadToTy->getTypeID()) { + return true; + } + + return false; +} + +void VecClone::updateScalarMemRefsWithVector( + Function *Clone, Function &F, BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, PHINode *Phi, + std::vector &VectorParmMap) { + // This function replaces the old scalar uses of a parameter with a reference + // to the new vector one. A gep is inserted using the vector bitcast created + // in the entry block and any uses of the parameter are replaced with this + // gep. The only users that will not be updated are those in the entry block + // that do the initial store to the vector alloca of the parameter. + + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + + SmallVector InstsToUpdate; + Value *Parm = VectorParmMapIt->VectorParm; + Instruction *Cast = VectorParmMapIt->VectorParmCast; + + for (User *U : Parm->users()) { + InstsToUpdate.push_back(dyn_cast(U)); + } + + for (unsigned I = 0; I < InstsToUpdate.size(); ++I) { + + Instruction *User = InstsToUpdate[I]; + if (!(dyn_cast(User) && User->getParent() == EntryBlock)) { + + BitCastInst *BitCast = dyn_cast(Cast); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + GetElementPtrInst *VecGep = GetElementPtrInst::Create( + PointeeType, BitCast, Phi, BitCast->getName() + ".gep", User); + + unsigned NumOps = User->getNumOperands(); + for (unsigned I = 0; I < NumOps; ++I) { + if (User->getOperand(I) == Parm) { + + bool TypesAreCompatible = false; + + if (isa(User)) { + TypesAreCompatible = + typesAreCompatibleForLoad(VecGep->getType(), User->getType()); + } + + if ((isa(User) && TypesAreCompatible) || + isa(User)) { + // If the user is a load/store and the dereferencing is legal, + // then just modify the load/store operand to use the gep. + User->setOperand(I, VecGep); + } else { + // Otherwise, we need to load the value from the gep first before + // using it. This effectively loads the particular element from + // the vector parameter. + LoadInst *ParmElemLoad = + new LoadInst(VecGep, "vec." + Parm->getName() + ".elem"); + ParmElemLoad->insertAfter(VecGep); + User->setOperand(I, ParmElemLoad); + } + } + } + } else { + // The user is the parameter store to alloca in the entry block. Replace + // the old scalar alloca with the new vector one. + AllocaInst *VecAlloca = dyn_cast(Cast->getOperand(0)); + User->setOperand(1, VecAlloca); + } + } + } + + DEBUG(dbgs() << "After Alloca Replacement\n"); + DEBUG(Clone->dump()); +} + +Instruction *VecClone::generateStrideForParameter(Function *Clone, + Argument *Arg, + Instruction *ParmUser, + int Stride, PHINode *Phi) { + // For linear values, a mul/add sequence is needed to generate the correct + // value. 
i.e., val = linear_var * stride + loop_index; + // + // StrideInst is returned as the last instruction needed to update the users + // of the old parameter reference. + Instruction *StrideInst = nullptr; + + // The phi for the loop index is generated by this pass as i32, which is + // why the mul instruction is i32. + Constant *StrideConst = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), Stride); + + Instruction *Mul = BinaryOperator::CreateMul(StrideConst, Phi, "stride.mul"); + + // Insert the stride related instructions after the user if the instruction + // involves a redefinition of the parameter. For example, a load from the + // parameter's associated alloca or a cast. For these situations, we want to + // apply the stride to this Value. For other instructions, e.g., add, the + // instruction computing the stride must be inserted before the usage of it. + + if (!isa(ParmUser)) { + Mul->insertBefore(ParmUser); + } else { + Mul->insertAfter(ParmUser); + } + + if (Arg->getType()->isPointerTy()) { + + // Linear updates to pointer parameters involves an address calculation, so + // use gep. To properly update linear pointers we only need to multiply the + // loop index and stride since gep is indexed starting at 0 from the base + // address passed to the vector function. + PointerType *ParmPtrType = dyn_cast(Arg->getType()); + + // The base address used for linear gep computations. + Value *BaseAddr = nullptr; + StringRef RefName; + + if (LoadInst *ParmLoad = dyn_cast(ParmUser)) { + // We are loading from the alloca of the pointer parameter (no Mem2Reg) + // i.e., loading a pointer to another Value. + BaseAddr = ParmUser; + RefName = ParmLoad->getOperand(0)->getName(); + } else { + // The user is using the pointer parameter directly. + BaseAddr = Arg; + RefName = BaseAddr->getName(); + } + + // Mul is always generated as i32 since it is calculated using the i32 loop + // phi that is inserted by this pass. No cast on Mul is necessary because + // gep can use a base address of one type with an index of another type. + GetElementPtrInst *LinearParmGep = GetElementPtrInst::Create( + ParmPtrType->getElementType(), BaseAddr, Mul, RefName + ".gep"); + + LinearParmGep->insertAfter(Mul); + StrideInst = LinearParmGep; + } else { + // Note: the phi for the loop index is generated by this pass as i32. + // Also, Mul above is generated as i32 because the phi type is always i32. + // However, ParmUser may be another type, so + // + // Generate the instruction that computes the stride. + // + // Example of applying stride: + // + // define float @dowork(float* nocapture readonly %a, float %b, i64 %k) { + // entry: + // %arrayidx = getelementptr inbounds float, float* %a, i64 %k + // %0 = load float, float* %arrayidx, align 4, !tbaa !2 + // %call = tail call float @sinf(float %0) #4 + // %add = fadd float %call, %b + // %conv = sitofp i64 %k to float + // %add1 = fadd float %add, %conv + // ret float %add1 + // } + // + // Case 1: stride for %k must be applied to %conv after %conv so that + // %add1 gets the updated stride value. + // + // Case 2: stride for %k in %arrayidx must be applied before the gep, + // so that the correct index is used. This is the same for other + // non-unary type instructions. 
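  // For reference, the closed form of the value materialized for lane %index:
  //
  //   non-pointer linear parm:  parm (or its converted value, for case 1)
  //                             + Stride * %index
  //   pointer linear parm:      gep base, Stride * %index   ; in element units
  //
  // where the i32 multiply is first sign-extended or bitcast whenever the
  // addend's type is not i32 (the stride.cast handling below).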
+ // + BinaryOperator *Add; + Value *StrideVal; + Type *StrideTy; + StringRef TempName = "stride.add"; + if (isa(ParmUser)) { + // Case 1 + StrideVal = ParmUser; + StrideTy = ParmUser->getType(); + } else { + // Case 2 + StrideVal = Arg; + StrideTy = Arg->getType(); + } + + // Stride calculations may need typecasting since the index multiply + // is i32, but the users of the linear value may not be i32. + // + // Example (stride applied to %conv): + // + // %conv = sitofp i64 %k to float + // %stride.mul = mul i32 1, %index + // %stride.cast = bitcast i32 %stride.mul to float + // %stride.add = fadd float %conv, %stride.cast + // %add1 = fadd float %add, %stride.add + // + if (StrideTy != Mul->getType()) { + Instruction *MulConv = + CastInst::CreateSExtOrBitCast(Mul, StrideTy, "stride.cast"); + MulConv->insertAfter(Mul); + Mul = MulConv; + } + + if (StrideTy->isFloatingPointTy()) { + Add = BinaryOperator::CreateFAdd(StrideVal, Mul, TempName); + } else if (StrideTy->isIntegerTy()) { + Add = BinaryOperator::CreateAdd(StrideVal, Mul, TempName); + } else { + llvm_unreachable("Expected integer or floating point type"); + } + + Add->insertAfter(Mul); + StrideInst = Add; + } + + return StrideInst; +} + +void VecClone::updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi) { + // Add stride to parameters marked as linear. This is done by finding all + // users of the scalar alloca associated with the parameter. The user should + // be a load from this alloca to a temp. The stride is then added to this temp + // and its uses are replaced with the new temp. Or, if Mem2Reg eliminates the + // alloca/load, the parameter is used directly and this use is updated with + // the stride. + + Function::arg_iterator ArgListIt = Clone->arg_begin(); + Function::arg_iterator ArgListEnd = Clone->arg_end(); + std::vector ParmKinds = V.getParameters(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + + User::user_iterator ArgUserIt = ArgListIt->user_begin(); + User::user_iterator ArgUserEnd = ArgListIt->user_end(); + unsigned ParmIdx = ArgListIt->getArgNo(); + SmallVector LinearParmUsers; + + if (ParmKinds[ParmIdx].isLinear()) { + + int Stride = ParmKinds[ParmIdx].getStride(); + + for (; ArgUserIt != ArgUserEnd; ++ArgUserIt) { + + // Collect all uses of the parameter so that they can later be used to + // apply the stride. + Instruction *ParmUser = dyn_cast(*ArgUserIt); + if (StoreInst *ParmStore = dyn_cast(ParmUser)) { + + // This code traces the store of the parameter to its associated + // alloca. Then, we look for a load from that alloca to a temp. This + // is the value we need to add the stride to. This is for when + // Mem2Reg has not been run. + AllocaInst *Alloca = dyn_cast(ArgUserIt->getOperand(1)); + + if (Alloca) { + for (auto *AU : Alloca->users()) { + + LoadInst *ParmLoad = dyn_cast(AU); + + if (ParmLoad) { + // The parameter is being loaded from an alloca to a new SSA + // temp. We must replace the users of this load with an + // instruction that adds the result of this load with the + // stride. + LinearParmUsers.push_back(ParmLoad); + } + } + } else { + // Mem2Reg has run, so the parameter is directly referenced in the + // store instruction. + LinearParmUsers.push_back(ParmStore); + } + } else { + // Mem2Reg has registerized the parameters, so users of it will use + // it directly, and not through a load of the parameter. 
+ LinearParmUsers.push_back(ParmUser); + } + } + + for (unsigned I = 0; I < LinearParmUsers.size(); I++) { + // For each user of parameter: + // + // We must deal with two cases here, based on whether Mem2Reg has been + // run. + // + // Example: + // + // __declspec(vector(linear(i:1),uniform(x),vectorlength(4))) + // extern int foo(int i, int x) { + // return (x + i); + // } + // + // 1) We are loading the parameter from an alloca and the SSA temp as + // as a result of the load is what we need to add the stride to. + // Then, any users of that temp must be replaced. The only load + // instructions put in the collection above are guaranteed to be + // associated with the parameter's alloca. Thus, we only need to + // check to see if a load is in the map to know what to do. + // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 <--- %i + // %add = add nsw i32 %0, %1 <--- replace %1 with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %1, %stride.mul <--- stride + // %add = add nsw i32 %0, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // 2) The user uses the parameter directly, and so we must apply the + // stride directly to the parameter. Any users of the parameter + // must then be updated. + // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %add = add nsw i32 %x, %i <-- direct usage of %i + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %i, %stride.mul <--- stride + // %add = add nsw i32 %x, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + + Instruction *StrideInst = generateStrideForParameter( + Clone, &*ArgListIt, LinearParmUsers[I], Stride, Phi); + + SmallVector InstsToUpdate; + Value *ParmUser; + + if (isa(LinearParmUsers[I])) { + // Case 1 + ParmUser = LinearParmUsers[I]; + User::user_iterator StrideUserIt = LinearParmUsers[I]->user_begin(); + User::user_iterator StrideUserEnd = LinearParmUsers[I]->user_end(); + + // Find the users of the redefinition of the parameter so that we + // can apply the stride to those instructions. 
+ for (; StrideUserIt != StrideUserEnd; ++StrideUserIt) { + Instruction *StrideUser = dyn_cast(*StrideUserIt); + if (StrideUser != StrideInst) { + // We've already inserted the stride which is now also a user of + // the parameter, so don't update that instruction. Otherwise, + // we'll create a self reference. Hence, why we don't use + // replaceAllUsesWith(). + InstsToUpdate.push_back(StrideUser); + } + } + } else { + // Case 2 + ParmUser = &*ArgListIt; + InstsToUpdate.push_back(LinearParmUsers[I]); + } + + // Replace the old references to the parameter with the instruction + // that applies the stride. + for (unsigned J = 0; J < InstsToUpdate.size(); ++J) { + unsigned NumOps = InstsToUpdate[J]->getNumOperands(); + for (unsigned K = 0; K < NumOps; ++K) { + if (InstsToUpdate[J]->getOperand(K) == ParmUser) { + InstsToUpdate[J]->setOperand(K, StrideInst); + } + } + } + } + } + } + + DEBUG(dbgs() << "After Linear Updates\n"); + DEBUG(Clone->dump()); +} + +void VecClone::updateReturnBlockInstructions(Function *Clone, + BasicBlock *ReturnBlock, + Instruction *ExpandedReturn) { + // If the vector function returns void, then there is no need to do any + // packing. The only instruction in the ReturnBlock is 'ret void', so + // we can just leave this instruction and we're done. + if (Clone->getReturnType()->isVoidTy()) + return; + + // Collect all instructions in the return basic block. They will be removed. + SmallVector InstToRemove; + BasicBlock::iterator InstIt = ReturnBlock->begin(); + BasicBlock::iterator InstEnd = ReturnBlock->end(); + + for (; InstIt != InstEnd; ++InstIt) { + InstToRemove.push_back(&*InstIt); + } + + // Remove all instructions from the return block. These will be replaced + // with the instructions necessary to return a vector temp. The verifier + // will complain if we remove the definitions of users first, so remove + // instructions from the bottom up. + for (int I = InstToRemove.size() - 1; I >= 0; I--) { + InstToRemove[I]->eraseFromParent(); + } + + // Pack up the elements into a vector temp and return it. If the return + // vector was bitcast to a pointer to the element type, we must bitcast to + // vector before returning. + Instruction *Return; + if (dyn_cast(ExpandedReturn)) { + // Operand 0 is the actual alloc reference in the bitcast. + AllocaInst *Alloca = dyn_cast(ExpandedReturn->getOperand(0)); + PointerType *PtrVecType = PointerType::get( + Clone->getReturnType(), Alloca->getType()->getAddressSpace()); + BitCastInst *BitCast = + new BitCastInst(ExpandedReturn, PtrVecType, + "vec." + ExpandedReturn->getName(), ReturnBlock); + Return = BitCast; + } else { + Return = ExpandedReturn; + } + + LoadInst *VecReturn = new LoadInst(Return, "vec.ret", ReturnBlock); + ReturnInst::Create(Clone->getContext(), VecReturn, ReturnBlock); + + DEBUG(dbgs() << "After Return Block Update\n"); + DEBUG(Clone->dump()); +} + +int VecClone::getParmIndexInFunction(Function *F, Value *Parm) { + Function::arg_iterator ArgIt = F->arg_begin(); + Function::arg_iterator ArgEnd = F->arg_end(); + for (unsigned Idx = 0; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + if (Parm == &*ArgIt) + return Idx; + } + + return -1; +} + +bool VecClone::isSimpleFunction(Function *Clone, VectorVariant &V, + ReturnInst *ReturnOnly) { + // For really simple functions, there is no need to go through the process + // of inserting a loop. + + // Example: + // + // void foo(void) { + // return; + // } + // + // No need to insert a loop for this case since it's basically a no-op. Just + // clone the function and return. 
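  // For reference (this refers back to updateReturnBlockInstructions above):
  // after the old scalar instructions are removed, the rebuilt return block
  // for a <4 x i32> variant looks like
  //
  //   return:
  //     %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>*
  //     %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast
  //     ret <4 x i32> %vec.ret
  //
  // as checked by the two_vec_sum tests below.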
It's possible that we could have some code + // inside of a vector function that modifies global memory. Let that case go + // through. + if (ReturnOnly && Clone->getReturnType()->isVoidTy()) { + return true; + } + + return false; +} + +void VecClone::insertSplitForMaskedVariant(Function *Clone, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi) { + BasicBlock *LoopThenBlock = + LoopBlock->splitBasicBlock(LoopBlock->getFirstNonPHI(), "simd.loop.then"); + + BasicBlock *LoopElseBlock = BasicBlock::Create( + Clone->getContext(), "simd.loop.else", Clone, LoopExitBlock); + + BranchInst::Create(LoopExitBlock, LoopElseBlock); + + BitCastInst *BitCast = dyn_cast(Mask); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + GetElementPtrInst *MaskGep = GetElementPtrInst::Create( + PointeeType, Mask, Phi, "mask.gep", LoopBlock->getTerminator()); + + LoadInst *MaskLoad = + new LoadInst(MaskGep, "mask.parm", LoopBlock->getTerminator()); + + Type *CompareTy = MaskLoad->getType(); + Instruction *MaskCmp; + Constant *Zero; + + // Generate the compare instruction to see if the mask bit is on. In ICC, we + // use the movemask intrinsic which takes both float/int mask registers and + // converts to an integer scalar value, one bit representing each element. + // AVR construction will be complicated if this intrinsic is introduced here, + // so the current solution is to just generate either an integer or floating + // point compare instruction for now. This may change anyway if we decide to + // go to a vector of i1 values for the mask. I suppose this would be one + // positive reason to use vector of i1. + if (CompareTy->isIntegerTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0); + MaskCmp = new ICmpInst(LoopBlock->getTerminator(), CmpInst::ICMP_NE, + MaskLoad, Zero, "mask.cond"); + } else if (CompareTy->isFloatingPointTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0.0); + MaskCmp = new FCmpInst(LoopBlock->getTerminator(), CmpInst::FCMP_UNE, + MaskLoad, Zero, "mask.cond"); + } else { + assert(0 && "Unsupported mask compare"); + } + + TerminatorInst *Term = LoopBlock->getTerminator(); + Term->eraseFromParent(); + BranchInst::Create(LoopThenBlock, LoopElseBlock, MaskCmp, LoopBlock); + + DEBUG(dbgs() << "After Split Insertion For Masked Variant\n"); + DEBUG(Clone->dump()); +} + +void VecClone::removeScalarAllocasForVectorParams( + std::vector &VectorParmMap) { + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + Value *Parm = VectorParmMapIt->VectorParm; + if (AllocaInst *ScalarAlloca = dyn_cast(Parm)) { + ScalarAlloca->eraseFromParent(); + } + } +} + +void VecClone::disableLoopUnrolling(BasicBlock *Latch) { + // Set disable unroll metadata on the conditional branch of the loop latch + // for the simd loop. The following is an example of what the loop latch + // and Metadata will look like. The !llvm.loop marks the beginning of the + // loop Metadata and is always placed on the terminator of the loop latch. + // (i.e., simd.loop.exit in this case). According to LLVM documentation, to + // properly set the loop Metadata, the 1st operand of !16 must be a self- + // reference to avoid some type of Metadata merging conflicts that have + // apparently arisen in the past. This is part of LLVM history that I do not + // know. Also, according to LLVM documentation, any Metadata nodes referring + // to themselves are marked as distinct. 
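  // For reference (this refers back to insertSplitForMaskedVariant above):
  // the masked two_vec_sum variant ends up with a per-lane guard of the form
  //
  //   simd.loop:
  //     %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ]
  //     %mask.gep = getelementptr i32, i32* %mask.cast, i32 %index
  //     %mask.parm = load i32, i32* %mask.gep
  //     %mask.cond = icmp ne i32 %mask.parm, 0
  //     br i1 %mask.cond, label %simd.loop.then, label %simd.loop.else
  //
  //   simd.loop.else:                                  ; lane masked off
  //     br label %simd.loop.exit
  //
  // with the original body now living in %simd.loop.then.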
As such, all Metadata corresponding + // to a loop belongs to that loop alone and no sharing of Metadata can be + // done across different loops. + // + // simd.loop.exit: ; preds = %simd.loop, %if.else, %if.then + // %indvar = add nuw i32 %index, 1 + // %vl.cond = icmp ult i32 %indvar, 2 + // br i1 %vl.cond, label %simd.loop, label %simd.end.region, !llvm.loop !16 + // + // !16 = distinct !{!16, !17} + // !17 = !{!"llvm.loop.unroll.disable"} + + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + + // Add unroll(disable) metadata to disable future unrolling. + LLVMContext &Context = Latch->getContext(); + SmallVector DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + Latch->getTerminator()->setMetadata("llvm.loop", NewLoopID); +} + +bool VecClone::runOnModule(Module &M) { + + DEBUG(dbgs() << "\nExecuting SIMD Function Cloning ...\n\n"); + + std::map> FunctionsToVectorize; + getFunctionsToVectorize(M, FunctionsToVectorize); + + // VectorParmMap contains the mapping of the parameter to the bitcast + // instruction that casts the vector alloca for vector parameters to a scalar + // pointer for use in the simd loop. When parameters are registerized, the + // Value* in the map correponds directly to the function parameter. When + // parameters are not registerized, then the Value* in the map is the original + // scalar alloca before expansion. Later, users of the parameter, either + // directly or through the alloca, are replaced with a gep using the bitcast + // of the vector alloca for the parameter and the current loop induction + // variable value. + // + // IMPORTANT NOTE: std::vector was used here because later we emit LLVM + // instructions using the members of ParmRef, and these instructions should be + // ordered consistently for easier testability. + + std::vector VectorParmMap; + + std::map>::iterator VarIt; + std::map>::iterator VarEnd; + for (VarIt = FunctionsToVectorize.begin(), + VarEnd = FunctionsToVectorize.end(); + VarIt != VarEnd; ++VarIt) { + + Function &F = *(VarIt->first); + std::vector Variants = VarIt->second; + TargetTransformInfo *TTI = + &getAnalysis().getTTI(F); + + for (unsigned i = 0; i < Variants.size(); i++) { + + VectorVariant Variant(Variants[i], TTI); + + // Clone the original function. + DEBUG(dbgs() << "Before SIMD Function Cloning\n"); + DEBUG(F.dump()); + Function *Clone = CloneFunction(F, Variant); + Function::iterator EntryBlock = Clone->begin(); + BasicBlock::iterator FirstInst = EntryBlock->begin(); + ReturnInst *ReturnOnly = dyn_cast(FirstInst); + + if (isSimpleFunction(Clone, Variant, ReturnOnly)) { + continue; + } + + BasicBlock *LoopBlock = splitEntryIntoLoop(Clone, Variant, &*EntryBlock); + BasicBlock *ReturnBlock = splitLoopIntoReturn(Clone, &Clone->back()); + BasicBlock *LoopExitBlock = createLoopExit(Clone, ReturnBlock); + PHINode *Phi = createPhiAndBackedgeForLoop(Clone, &*EntryBlock, LoopBlock, + LoopExitBlock, ReturnBlock, + Variant.getVlen()); + + // At this point, we've gathered some parameter information and have + // restructured the function into an entry block, a set of blocks + // forming the loop, a loop exit block, and a return block. 
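      // For reference, with vlen = 4 the skeleton built so far has the shape
      // the two_vec_sum test checks:
      //
      //   simd.loop:
      //     %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ]
      //     ...                              ; body, still scalar at this point
      //     br label %simd.loop.exit
      //
      //   simd.loop.exit:
      //     %indvar = add nuw i32 %index, 1
      //     %vl.cond = icmp ult i32 %indvar, 4
      //     br i1 %vl.cond, label %simd.loop, label %return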
Now, + // we can go through and update instructions since we know what + // is part of the loop. + + // Create a new vector alloca instruction for all vector parameters and + // return. For parameters, replace the initial store to the old alloca + // with the vector one. Users of the old alloca within the loop will be + // replaced with a gep using this address along with the proper loop + // index. + + Instruction *Mask = NULL; + Instruction *ExpandedReturn = expandVectorParametersAndReturn( + Clone, Variant, &Mask, &*EntryBlock, LoopBlock, ReturnBlock, + VectorParmMap); + updateScalarMemRefsWithVector(Clone, F, &*EntryBlock, ReturnBlock, Phi, + VectorParmMap); + + // Update any linear variables with the appropriate stride. This function + // will insert a mul/add sequence before the use of the parameter. For + // linear pointer parameters, the stride calculation is just a mul + // instruction using the loop induction var and the stride value on the + // parameter. This mul instruction is then used as the index of the gep + // that will be inserted before the next use of the parameter. The + // function also updates the users of the parameter with the new + // calculation involving the stride. + updateLinearReferences(Clone, F, Variant, Phi); + + // Remove the old scalar instructions associated with the return and + // replace with packing instructions. + updateReturnBlockInstructions(Clone, ReturnBlock, ExpandedReturn); + + // Remove the old scalar allocas associated with vector parameters since + // these have now been replaced with vector ones. + removeScalarAllocasForVectorParams(VectorParmMap); + + for (auto *Parm : VectorParmMap) { + delete Parm; + } + VectorParmMap.clear(); + + // If this is the masked vector variant, insert the mask condition and + // if/else blocks. + if (Variant.isMasked()) { + insertSplitForMaskedVariant(Clone, LoopBlock, LoopExitBlock, Mask, Phi); + } + + DEBUG(dbgs() << "After SIMD Function Cloning\n"); + DEBUG(Clone->dump()); + + // Disable unrolling from kicking in on the simd loop. + disableLoopUnrolling(LoopExitBlock); + + } // End of function cloning for the variant + } // End of function cloning for all variants + + return true; // LLVM IR has been modified +} + +void VecClone::print(raw_ostream &OS, const Module *M) const { + // TODO +} + +ModulePass *llvm::createVecClonePass() { return new llvm::VecClone(); } + +char VecClone::ID = 0; + +static const char lv_name[] = "VecClone"; +INITIALIZE_PASS_BEGIN(VecClone, SV_NAME, lv_name, false /* modifies CFG */, + false /* transform pass */) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(VecClone, SV_NAME, lv_name, false /* modififies CFG */, + false /* transform pass */) Index: test/Transforms/VecClone/all_parm_types.ll =================================================================== --- test/Transforms/VecClone/all_parm_types.ll +++ test/Transforms/VecClone/all_parm_types.ll @@ -0,0 +1,46 @@ +; Test all different kinds of parameters (uniform, linear, vector), multiple uses of linear k, and that stride calculations can handle type conversions. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4uvl_dowork +; CHECK: simd.loop: +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %stride.cast{{.*}} = sext i32 %stride.mul{{.*}} +; CHECK: %stride.add{{.*}} = add i64 %k, %stride.cast{{.*}} +; CHECK: %arrayidx = getelementptr inbounds float, float* %a, i64 %stride.add{{.*}} +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %stride.cast{{.*}} = bitcast i32 %stride.mul{{.*}} to float +; CHECK: %stride.add{{.*}} = fadd float %conv, %stride.cast{{.*}} +; CHECK: %add{{.*}} = fadd float %add, %stride.add{{.*}} + +; ModuleID = 'rfc.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define float @dowork(float* %a, float %b, i64 %k) #0 { +entry: + %arrayidx = getelementptr inbounds float, float* %a, i64 %k + %0 = load float, float* %arrayidx, align 4, !tbaa !2 + %call = call float @sinf(float %0) #5 + %add = fadd float %call, %b + %conv = sitofp i64 %k to float + %add1 = fadd float %add, %conv + ret float %add1 +} + +; Function Attrs: nounwind +declare float @sinf(float) #1 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4uvl_dowork,_ZGVcN8uvl_dowork,_ZGVdN8uvl_dowork,_ZGVeN16uvl_dowork,_ZGVbM4uvl_dowork,_ZGVcM8uvl_dowork,_ZGVdM8uvl_dowork,_ZGVeM16uvl_dowork" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0 (trunk 316400)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"float", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} Index: test/Transforms/VecClone/broadcast.ll =================================================================== --- test/Transforms/VecClone/broadcast.ll +++ test/Transforms/VecClone/broadcast.ll @@ -0,0 +1,19 @@ +; Check broadcast of a constant. The store of the constant should be moved inside of the loop. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4_foo +; CHECK: simd.loop: +; CHECK: store i32 99, i32* %ret.cast.gep + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo() #0 { +entry: + ret i32 99 +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4_foo,_ZGVbN4_foo,_ZGVcM8_foo,_ZGVcN8_foo,_ZGVdM8_foo,_ZGVdN8_foo,_ZGVeM16_foo,_ZGVeN16_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/convert_linear.ll =================================================================== --- test/Transforms/VecClone/convert_linear.ll +++ test/Transforms/VecClone/convert_linear.ll @@ -0,0 +1,32 @@ +; Check handling of upconverting a linear (variable %i) to ensure stride calculation +; is inserted correctly and the old convert (sext) uses the stride instead of the old +; reference to %i. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN2vl_foo +; CHECK: simd.loop: +; CHECK: %0 = load i32, i32* %i.addr +; CHECK-NEXT: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %0, %stride.mul +; CHECK-NEXT: %conv = sext i32 %stride.add to i64 + +; ModuleID = 'convert.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i64 @foo(i64 %x, i32 %i) #0 { +entry: + %x.addr = alloca i64, align 8 + %i.addr = alloca i32, align 4 + store i64 %x, i64* %x.addr, align 8 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %x.addr, align 8 + %add = add nsw i64 %conv, %1 + ret i64 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM2vl_foo,_ZGVbN2vl_foo,_ZGVcM4vl_foo,_ZGVcN4vl_foo,_ZGVdM4vl_foo,_ZGVdN4vl_foo,_ZGVeM8vl_foo,_ZGVeN8vl_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/external_array.ll =================================================================== --- test/Transforms/VecClone/external_array.ll +++ test/Transforms/VecClone/external_array.ll @@ -0,0 +1,35 @@ +; Check to see that we are applying the correct updated linear index for an external array access gep. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4ul_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %idxprom = sext i32 %stride.add to i64 +; CHECK: %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom +; CHECK: store i32 %0, i32* %arrayidx + +; ModuleID = 'external_array_assign.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@ext_a = common global [128 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @foo(i32 %x, i32 %i) #0 { +entry: + %x.addr = alloca i32, align 4 + %i.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom + store i32 %0, i32* %arrayidx, align 4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable "vector-variants"="_ZGVbM4ul_foo,_ZGVbN4ul_foo,_ZGVcM8ul_foo,_ZGVcN8ul_foo,_ZGVdM8ul_foo,_ZGVdN8ul_foo,_ZGVeM16ul_foo,_ZGVeN16ul_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear.ll =================================================================== --- test/Transforms/VecClone/linear.ll +++ test/Transforms/VecClone/linear.ll @@ -0,0 +1,29 @@ +; Check to see that the linear parameter i is updated with the correct stride, indicated by a mul/add instruction sequence after the load. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %add = add nsw i32 %0, %stride.add + +; ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %i.addr = alloca i32, align 4 + %x.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %x, i32* %x.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4lu_foo,_ZGVbN4lu_foo,_ZGVcM8lu_foo,_ZGVcN8lu_foo,_ZGVdM8lu_foo,_ZGVdN8lu_foo,_ZGVeM16lu_foo,_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear_mem2reg.ll =================================================================== --- test/Transforms/VecClone/linear_mem2reg.ll +++ test/Transforms/VecClone/linear_mem2reg.ll @@ -0,0 +1,22 @@ +; Check to see that the linear parameter i is updated with the correct stride when Mem2Reg is on. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %i, %stride.mul +; CHECK-NEXT: %add = add nsw i32 %x, %stride.add + +;ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %add = add nsw i32 %x, %i + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4lu_foo,_ZGVbN4lu_foo,_ZGVcM8lu_foo,_ZGVcN8lu_foo,_ZGVdM8lu_foo,_ZGVdN8lu_foo,_ZGVeM16lu_foo,_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/struct_linear_ptr.ll =================================================================== --- test/Transforms/VecClone/struct_linear_ptr.ll +++ test/Transforms/VecClone/struct_linear_ptr.ll @@ -0,0 +1,40 @@ +; Test that the stride is being applied correctly to struct field accesses. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4l_foo +; CHECK: simd.loop: +; CHECK: %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %0, i32 %stride.mul{{.*}} +; CHECK: %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 0 +; CHECK: %1 = load float, float* %field1, align 8 +; CHECK: %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %2, i32 %stride.mul{{.*}} +; CHECK: %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 4 +; CHECK: %3 = load float, float* %field5, align 8 +; CHECK: %add = fadd float %1, %3 + +; ModuleID = 'struct_linear_ptr.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.my_struct = type { float, i8, i32, i16, float, i64 } + +; Function Attrs: nounwind uwtable +define float @foo(%struct.my_struct* %s) #0 { +entry: + %s.addr = alloca %struct.my_struct*, align 8 + store %struct.my_struct* %s, %struct.my_struct** %s.addr, align 8 + %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %0, i32 0, i32 0 + %1 = load float, float* %field1, align 8 + %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %2, i32 0, i32 4 + %3 = load float, float* %field5, align 8 + %add = fadd float %1, %3 + ret float %add +} + +attributes #0 = { norecurse nounwind readonly uwtable "vector-variants"="_ZGVbM4l_foo,_ZGVbN4l_foo,_ZGVcM8l_foo,_ZGVcN8l_foo,_ZGVdM8l_foo,_ZGVdN8l_foo,_ZGVeM16l_foo,_ZGVeN16l_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum.ll +++ test/Transforms/VecClone/two_vec_sum.ll @@ -0,0 +1,59 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the non-masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
+ +; CHECK-LABEL: <4 x i32> @_ZGVbN4vv_vec_sum(<4 x i32> %i, <4 x i32> %j) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mask.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mask.ll +++ test/Transforms/VecClone/two_vec_sum_mask.ll @@ -0,0 +1,71 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
+ +; Begin non-masked variant checking + +; CHECK-LABEL: <4 x i32> @_ZGVbM4vv_vec_sum(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.mask = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i, align 4 +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j, align 4 +; CHECK-NEXT: store <4 x i32> %mask, <4 x i32>* %vec.mask +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: %mask.cast = bitcast <4 x i32>* %vec.mask to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %mask.gep = getelementptr i32, i32* %mask.cast, i32 %index +; CHECK-NEXT: %mask.parm = load i32, i32* %mask.gep +; CHECK-NEXT: %mask.cond = icmp ne i32 %mask.parm, 0 +; CHECK-NEXT: br i1 %mask.cond, label %simd.loop.then, label %simd.loop.else + +; CHECK: simd.loop.then: +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.else: +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mem2reg.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mem2reg.ll +++ test/Transforms/VecClone/two_vec_sum_mem2reg.ll @@ -0,0 +1,31 @@ +; Check to be sure that when Mem2Reg is on that all updates to instructions referring to the original 
+; parameter are updated correctly. When Mem2Reg is on, instructions will refer to the parameters +; directly and not through a load, which is why this is tested separately. + +; Note: the LLVM IR used as input to this test has already had Mem2Reg applied to it, so no need to +; do that here. This happens at higher optimization levels such as -O2. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking + +; CHECK-LABEL: @_ZGVbN4vv_vec_sum +; CHECK: simd.loop: +; CHECK: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK: %vec.i.elem = load i32, i32* %vec.i.cast.gep +; CHECK: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK: %vec.j.elem = load i32, i32* %vec.j.cast.gep +; CHECK: %add = add nsw i32 %vec.i.elem, %vec.j.elem + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %add = add nsw i32 %i, %j + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/uniform.ll =================================================================== --- test/Transforms/VecClone/uniform.ll +++ test/Transforms/VecClone/uniform.ll @@ -0,0 +1,25 @@ +; Check to make sure the initial parameter store of the uniform parameter is sunk into the loop. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: <4 x i32> @_ZGVbN4u_foo(i32 %b) +; CHECK: simd.loop: +; CHECK: store i32 %b + +; ModuleID = 'uniform.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %b) #0 { +entry: + %b.addr = alloca i32, align 4 + store i32 %b, i32* %b.addr, align 4 + %0 = load i32, i32* %b.addr, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* %b.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + ret i32 %1 +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4u_foo,_ZGVbN4u_foo,_ZGVcM8u_foo,_ZGVcN8u_foo,_ZGVdM8u_foo,_ZGVdN8u_foo,_ZGVeM16u_foo,_ZGVeN16u_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/vector_ptr.ll =================================================================== --- test/Transforms/VecClone/vector_ptr.ll +++ test/Transforms/VecClone/vector_ptr.ll @@ -0,0 +1,25 @@ +; Test that vector of pointers are handled with correctly in loop and that incompatible function return/arg attributes are removed. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN2v_dowork +; CHECK: simd.loop: +; CHECK: %vec.p.cast.gep = getelementptr float*, float** %vec.p.cast, i32 %index +; CHECK: %vec.p.elem = load float*, float** %vec.p.cast.gep +; CHECK: %add.ptr = getelementptr inbounds float, float* %vec.p.elem, i64 1 +; CHECK: %ret.cast.gep = getelementptr float*, float** %ret.cast, i32 %index +; CHECK: store float* %add.ptr, float** %ret.cast.gep + +source_filename = "vector_ptr.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: norecurse nounwind readnone uwtable +define nonnull float* @dowork(float* readnone %p) local_unnamed_addr #0 { +entry: + %add.ptr = getelementptr inbounds float, float* %p, i64 1 + ret float* %add.ptr +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN2v_dowork,_ZGVcN4v_dowork,_ZGVdN4v_dowork,_ZGVeN8v_ +dowork,_ZGVbM2v_dowork,_ZGVcM4v_dowork,_ZGVdM4v_dowork,_ZGVeM8v_dowork" } Index: test/Transforms/VecClone/void_foo.ll =================================================================== --- test/Transforms/VecClone/void_foo.ll +++ test/Transforms/VecClone/void_foo.ll @@ -0,0 +1,19 @@ +; Check to make sure we can handle void foo() function + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: void @_ZGVbN4_foo() +; CHECK: entry: +; CHECK: ret void + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @foo() #0 { +entry: + ret void +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4_foo1,_ZGVbN4_foo1,_ZGVcM8_foo1,_ZGVcN8_foo1,_ZGVdM8_foo1,_ZGVdN8_foo1,_ZGVeM16_foo1,_ZGVeN16_foo1" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/bugpoint/bugpoint.cpp =================================================================== --- tools/bugpoint/bugpoint.cpp +++ tools/bugpoint/bugpoint.cpp @@ -134,6 +134,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); #ifdef LINK_POLLY_INTO_TOOLS polly::initializePollyPasses(Registry); Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -389,6 +389,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeScalarizeMaskedMemIntrinPass(Registry);
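The variant names exercised by the tests above all follow the vector function ABI mangling _ZGV<isa><mask><vlen><parameter kinds>_<scalar name>; for example _ZGVbN4uvl_dowork is the SSE (XMM) class, non-masked, vector length 4, with parameter kinds uniform/vector/linear. The sketch below is a rough, standalone illustration of that decoding only; it is not part of the patch, the helper names are made up, and it deliberately ignores the optional stride digits and alignment suffixes the full ABI allows after a parameter kind.

#include <cctype>
#include <iostream>
#include <string>

struct VariantInfo {
  char Isa = '?';       // 'b' XMM (SSE), 'c' YMM1 (AVX), 'd' YMM2 (AVX2), 'e' ZMM (AVX-512)
  bool Masked = false;  // 'M' masked variant, 'N' non-masked
  unsigned Vlen = 0;    // vector length
  std::string Kinds;    // per-parameter kinds, e.g. "uvl"
  std::string Scalar;   // original scalar function name
};

// Decode a name of the form _ZGV<isa><mask><vlen><kinds>_<name>.  Simplified:
// any stride/alignment characters are left inside Kinds untouched.
static bool decodeVariant(const std::string &Name, VariantInfo &Out) {
  if (Name.size() < 8 || Name.compare(0, 4, "_ZGV") != 0)
    return false;
  std::string::size_type Pos = 4;
  Out.Isa = Name[Pos++];
  char MaskChar = Name[Pos++];
  if (MaskChar != 'M' && MaskChar != 'N')
    return false;
  Out.Masked = (MaskChar == 'M');
  while (Pos < Name.size() && std::isdigit(static_cast<unsigned char>(Name[Pos])))
    Out.Vlen = Out.Vlen * 10 + static_cast<unsigned>(Name[Pos++] - '0');
  std::string::size_type Sep = Name.find('_', Pos);
  if (Out.Vlen == 0 || Sep == std::string::npos || Sep + 1 >= Name.size())
    return false;
  Out.Kinds = Name.substr(Pos, Sep - Pos);
  Out.Scalar = Name.substr(Sep + 1);
  return true;
}

int main() {
  VariantInfo VI;
  if (decodeVariant("_ZGVbN4uvl_dowork", VI))
    std::cout << VI.Scalar << ": isa=" << VI.Isa << " masked=" << VI.Masked
              << " vlen=" << VI.Vlen << " kinds=" << VI.Kinds << '\n';
  return 0;
}

Run as-is, this prints "dowork: isa=b masked=0 vlen=4 kinds=uvl", matching the non-masked SSE variant checked in the all_parm_types test.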