Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -27,6 +27,7 @@
 class TargetTransformInfo;
 class Type;
 class Value;
+class VectorVariant;
 
 namespace Intrinsic {
 enum ID : unsigned;
@@ -123,6 +124,26 @@
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
+/// \brief Contains the names of the declared vector function variants.
+typedef std::vector<StringRef> DeclaredVariants;
+
+/// \brief Contains a mapping of a function to its vector function variants.
+typedef std::map<Function *, DeclaredVariants> FunctionVariants;
+
+/// \brief Get all function attributes that specify a vector variant.
+std::vector<Attribute> getVectorVariantAttributes(Function& F);
+
+/// \brief Determine the characteristic type of the vector function as
+/// specified according to the vector function ABI.
+Type* calcCharacteristicType(Function& F, VectorVariant& Variant);
+
+/// \brief Get all functions marked for vectorization in the module.
+void getFunctionsToVectorize(Module &M, FunctionVariants& FuncVars);
+
+/// \brief Returns a floating point or integer constant depending on Ty.
+template <typename T>
+Constant* getConstantValue(Type *Ty, LLVMContext &Context, T Val);
+
 } // llvm namespace
 
 #endif
Index: include/llvm/Analysis/VectorVariant.h
===================================================================
--- include/llvm/Analysis/VectorVariant.h
+++ include/llvm/Analysis/VectorVariant.h
@@ -0,0 +1,387 @@
+//===- llvm/Analysis/VectorVariant.h - Vector utilities ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This header defines the VectorVariant class and implements the encoding
+/// and decoding utilities for VectorVariant objects. Multiple VectorVariant
+/// objects can be created (masked, non-masked, etc.) and associated with the
+/// original scalar function. These objects are then used to clone new
+/// functions that can be vectorized. This class follows the standards defined
+/// in the vector function ABI.
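+///
+/// For example (illustration only): with this encoding, a mangled name such
+/// as "_ZGVbN4ua8v_foo" would describe an unmasked ('N'), XMM/SSE2 ('b')
+/// variant of a scalar function foo() with a vector length of 4, a uniform
+/// first parameter aligned to 8 bytes ("ua8"), and a vector second
+/// parameter ("v").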
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_VECTORVARIANT_H +#define LLVM_ANALYSIS_VECTORVARIANT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" +#include +#include +#include + +using namespace llvm; + +namespace llvm { + +#define STRIDE_KIND 's' +#define LINEAR_KIND 'l' +#define UNIFORM_KIND 'u' +#define VECTOR_KIND 'v' + +#define NOT_ALIGNED 1 + +#define POSITIVE 1 +#define NEGATIVE -1 + +class VectorKind { + +public: + VectorKind(char K, int S, int A = NOT_ALIGNED) { + + assert((S == notAValue() || K == STRIDE_KIND || K == LINEAR_KIND) && + "only linear vectors have strides"); + + assert((K != LINEAR_KIND || S != notAValue()) && + "linear vectors must have a stride"); + + assert((K != STRIDE_KIND || S != notAValue()) && + "variable stride vectors must have a stride"); + + assert((K != STRIDE_KIND || S >= 0) && + "variable stride position must be non-negative"); + + assert(A > 0 && "alignment must be positive"); + + Kind = K; + Stride = S; + Alignment = A; + } + + VectorKind(const VectorKind &Other) { + Kind = Other.Kind; + Stride = Other.Stride; + Alignment = Other.Alignment; + } + + /// \brief Is the stride for a linear parameter a uniform variable? (i.e., + /// the stride is stored in a variable but is uniform) + bool isVariableStride() { return Kind == STRIDE_KIND; } + + /// \brief Is the stride for a linear variable non-unit stride? + bool isNonUnitStride() { return Kind == LINEAR_KIND && Stride != 1; } + + /// \brief Is the stride for a linear variable unit stride? + bool isUnitStride() { return Kind == LINEAR_KIND && Stride == 1; } + + /// \brief Is this a linear parameter? + bool isLinear() { + return isVariableStride() || isNonUnitStride() || isUnitStride(); + } + + /// \brief Is this a uniform parameter? + bool isUniform() { return Kind == UNIFORM_KIND; } + + /// \brief Is this a vector parameter? + bool isVector() { return Kind == VECTOR_KIND; } + + /// \brief Is the parameter aligned? + bool isAligned() { return Alignment != NOT_ALIGNED; } + + /// \brief Get the stride associated with a linear parameter. + int getStride() { return Stride; } + + /// \brief Get the alignment associated with a linear parameter. + int getAlignment() { return Alignment; } + + /// \brief Represents a don't care value for strides of parameters other + /// than linear parameters. + static int notAValue() { return -1; } + + /// \brief Encode the parameter information into a mangled string + /// corresponding to the standards defined in the vector function ABI. + std::string encode() { + std::stringstream SST; + SST << Kind; + + if (isNonUnitStride()) { + if (Stride >= 0) + SST << Stride; + else + SST << "n" << -Stride; + } + + if (isVariableStride()) + SST << Stride; + + if (isAligned()) + SST << 'a' << Alignment; + + return SST.str(); + } + +private: + char Kind; // linear, uniform, vector + int Stride; + int Alignment; +}; + +class VectorVariant { +private: + // Targets defined in the vector function ABI. 
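+  // Each target processor maps onto one of the ISA classes below
+  // (XMM, YMM1, YMM2, ZMM); see targetProcessorISAClass().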
+ enum TargetProcessor { + Pentium4, // ISA extension = SSE2, ISA class = XMM + Pentium4SSE3, // ISA extension = SSE3, ISA class = XMM + Core2DuoSSSE3, // ISA extension = SSSE3, ISA class = XMM + Core2DuoSSE41, // ISA extension = SSE4_1, ISA class = XMM + CoreI7SSE42, // ISA extension = SSE4_2, ISA class = XMM + Core2ndGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core3rdGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core4thGenAVX, // ISA extension = AVX2, ISA class = YMM2 + Mic, // ISA extension = Xeon Phi, ISA class = MIC(ZMM) + FutureCpu22, // ISA extension = AVX512, ISA class = ZMM + FutureCpu23, // ISA extension = AVX512, ISA class = ZMM + }; + + // ISA classes defined in the vector function ABI. + enum ISAClass { + XMM, // (SSE2) + YMM1, // (AVX1) + YMM2, // (AVX2) + ZMM, // (MIC) + ISAClassesNum + }; + + ISAClass Isa; + bool Mask; + unsigned int Vlen; + std::vector Parameters; + + static std::string prefix() { return "_ZGV"; } + + /// \brief Determine the maximum vector register width based on the ISA classes + /// defined in the vector function ABI. + static unsigned int maximumSizeofISAClassVectorRegister(ISAClass I, Type *Ty); + +public: + VectorVariant(ISAClass I, bool M, unsigned int V, + const std::vector &P) + : Isa(I), Mask(M), Vlen(V), Parameters(P) { + if (Mask) { + // Masked variants will have an additional mask parameter + VectorKind VKind(VECTOR_KIND, VectorKind::notAValue()); + Parameters.push_back(VKind); + } + } + + VectorVariant(const VectorVariant &Other) + : Isa(Other.Isa), Mask(Other.Mask), Vlen(Other.Vlen), + Parameters(Other.Parameters) {} + + VectorVariant(StringRef FuncName); + + /// \brief Get the ISA corresponding to this vector variant. + ISAClass getISA() { return Isa; } + + /// \brief Is this a masked vector function variant? + bool isMasked() { return Mask; } + + /// \brief Get the vector length of the vector variant. + unsigned int getVlen() { return Vlen; } + + /// \brief Get the parameters of the vector variant. + std::vector &getParameters() { return Parameters; } + + /// \brief Build the mangled name for the vector variant. This function + /// builds a mangled name by including the encodings for the ISA class, + /// mask information, and all parameters. + std::string encode() { + + std::stringstream SST; + SST << prefix() << encodeISAClass(Isa) << encodeMask(Mask) << Vlen; + + std::vector::iterator It = Parameters.begin(); + std::vector::iterator End = Parameters.end(); + + if (isMasked()) + End--; // mask parameter is not encoded + + for (; It != End; ++It) + SST << (*It).encode(); + + SST << "_"; + + return SST.str(); + } + + /// \brief Generate a function name corresponding to a vector variant. + std::string generateFunctionName(StringRef ScalarFuncName) { + + static StringRef ManglingPrefix("_Z"); + std::string Name = encode(); + + if (ScalarFuncName.startswith(ManglingPrefix)) + return Name + ScalarFuncName.drop_front(ManglingPrefix.size()).str(); + else + return Name + ScalarFuncName.str(); + } + + /// \brief Some targets do not support particular types, so promote to a type + /// that is supported. + Type *promoteToSupportedType(Type *Ty) { + return promoteToSupportedType(Ty, *this); + } + + /// \brief Check to see if this is a vector variant based on the function + /// name. + static bool isVectorVariant(StringRef FuncName) { + return FuncName.startswith(prefix()); + } + + /// \brief Some targets do not support particular types, so promote to a type + /// that is supported. 
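+  /// For example, an i8 or i16 type is promoted to i32 when the variant
+  /// targets the ZMM ISA class.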
+ static Type *promoteToSupportedType(Type *Ty, VectorVariant &Variant) { + ISAClass I; + + I = Variant.getISA(); + // On ZMM promote char and short to int + if (I == ISAClass::ZMM && (Ty->isIntegerTy(8) || Ty->isIntegerTy(16))) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; + } + + /// \brief Get the ISA class corresponding to a particular target processor. + static ISAClass targetProcessorISAClass(TargetProcessor Target) { + + switch (Target) { + case Pentium4: + case Pentium4SSE3: + case Core2DuoSSSE3: + case Core2DuoSSE41: + case CoreI7SSE42: + return XMM; + case Core2ndGenAVX: + case Core3rdGenAVX: + return YMM1; + case Core4thGenAVX: + return YMM2; + case Mic: + case FutureCpu22: + case FutureCpu23: + return ZMM; + default: + llvm_unreachable("unsupported target processor"); + return XMM; + } + } + + /// \brief Get an ISA class string based on ISA class enum. + static std::string ISAClassToString(ISAClass IsaClass) { + switch (IsaClass) { + case XMM: + return "XMM"; + case YMM1: + return "YMM1"; + case YMM2: + return "YMM2"; + case ZMM: + return "ZMM"; + default: + assert(false && "unsupported ISA class"); + return "?"; + } + } + + /// \brief Get an ISA class enum based on ISA class string. + static ISAClass ISAClassFromString(std::string IsaClass) { + if (IsaClass == "XMM") + return XMM; + if (IsaClass == "YMM1") + return YMM1; + if (IsaClass == "YMM2") + return YMM2; + if (IsaClass == "ZMM") + return ZMM; + assert(false && "unsupported ISA class"); + return ISAClassesNum; + } + + /// \brief Encode the ISA class for the mangled variant name. + static char encodeISAClass(ISAClass IsaClass) { + + switch (IsaClass) { + case XMM: + return 'b'; + case YMM1: + return 'c'; + case YMM2: + return 'd'; + case ZMM: + return 'e'; + default: + break; + } + + assert(false && "unsupported ISA class"); + return '?'; + } + + /// \brief Decode the ISA class from the mangled variant name. + static ISAClass decodeISAClass(char IsaClass) { + + switch (IsaClass) { + case 'b': + return XMM; + case 'c': + return YMM1; + case 'd': + return YMM2; + case 'e': + return ZMM; + default: + llvm_unreachable("unsupported ISA class"); + return XMM; + } + } + + /// \brief Encode the mask information for the mangled variant name. + static char encodeMask(bool EncodeMask) { + + switch (EncodeMask) { + case true: + return 'M'; + case false: + return 'N'; + } + + llvm_unreachable("unsupported mask"); + } + + /// \brief Decode the mask information from the mangled variant name. + static bool decodeMask(char MaskToDecode) { + + switch (MaskToDecode) { + case 'M': + return true; + case 'N': + return false; + } + + llvm_unreachable("unsupported mask"); + } + + /// \brief Calculate the vector length for the vector variant. 
+ static unsigned int calcVlen(ISAClass I, Type *Ty); +}; + +} // llvm namespace + +#endif // LLVM_ANALYSIS_VECTORVARIANT_H Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -338,6 +338,7 @@ void initializeWinEHPreparePass(PassRegistry&); void initializeWriteBitcodePassPass(PassRegistry &); void initializeXRayInstrumentationPass(PassRegistry &); +void initializeVecClonePass(PassRegistry &); } #endif Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -46,6 +46,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/VecClone.h" #include "llvm/Transforms/Vectorize.h" #include "llvm/Support/Valgrind.h" #include @@ -198,6 +199,7 @@ (void) llvm::createMemDerefPrinter(); (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); + (void) llvm::createVecClonePass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); Index: include/llvm/Transforms/Utils/VecClone.h =================================================================== --- include/llvm/Transforms/Utils/VecClone.h +++ include/llvm/Transforms/Utils/VecClone.h @@ -0,0 +1,210 @@ +//===-------------- VecClone.h - Class definition -*- C++ -*---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This file defines the VecClone pass class. +/// +// ===--------------------------------------------------------------------=== // + +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#ifndef LLVM_TRANSFORMS_UTILS_VECCLONE_H +#define LLVM_TRANSFORMS_UTILS_VECCLONE_H + +namespace llvm { + +class ModulePass; + +/// \brief Represents the mapping of a vector parameter to its corresponding +/// vector to scalar type cast instruction. This done so that the scalar loop +/// inserted by this pass contains instructions that are in scalar form so that +/// the loop can later be vectorized. +struct ParmRef { + // Represents the parameter in one of two forms: + // 1) A vector alloca instruction if the parameter has not been registerized. + // 2) The parameter as the Value* passed in via the function call. + Value *VectorParm; + + // Represents the vector parameter cast from a vector type to scalar type. + Instruction *VectorParmCast; +}; + +class VecClone : public ModulePass { + + private: + + /// \brief Return true if the function has a complex type for the return + /// or parameters. + bool hasComplexType(Function *F); + + /// \brief Make a copy of the function if it is marked as SIMD. + Function* cloneFunction(Function &F, VectorVariant &V); + + /// \brief Take the entry basic block for the function as split off a second + /// basic block that will form the loop entry. + BasicBlock* splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock); + + /// \brief Take the loop entry basic block and split off a second basic + /// block into a new return basic block. 
+ BasicBlock* splitLoopIntoReturn(Function *Clone, BasicBlock *LoopBlock); + + /// \brief Generate a basic block to test the loop exit condition. + BasicBlock* createLoopExit(Function *Clone, BasicBlock *ReturnBlock); + + /// \brief Update the predecessors of the return basic block. + void updateReturnPredecessors(Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the backedge from the loop exit basic block to the loop + /// entry block. + PHINode* createPhiAndBackedgeForLoop(Function *Clone, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock, + int VL); + + /// \brief Generate vector alloca instructions for vector parameters and + /// change the parameter types to vector types. Expand the return value of + /// the function to a vector type. This function returns the instruction + /// corresponding to the expanded return and the instruction corresponding + /// to the mask. + Instruction* expandVectorParametersAndReturn( + Function *Clone, + VectorVariant &V, + Instruction **Mask, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Expand the function parameters to vector types. This function + /// returns the instruction corresponding to the mask. + Instruction* expandVectorParameters( + Function *Clone, + VectorVariant &V, + BasicBlock *EntryBlock, + std::vector& ParmMap); + + /// \brief Expand the function's return value to a vector type. + Instruction* expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Update the old parameter references to with the new vector + /// references. + void updateScalarMemRefsWithVector( + Function *Clone, + Function &F, + BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, + PHINode *Phi, + std::vector& ParmMap); + + /// \brief Update the values of linear parameters by adding the stride + /// before the use. + void updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi); + + /// \brief Update the instructions in the return basic block to return a + /// vector temp. + void updateReturnBlockInstructions(Function *Clone, BasicBlock *ReturnBlock, + Instruction *VecReturnAlloca); + + /// \brief Create a separate basic block to mark the begin and end of the + /// SIMD loop formed from the vector function. Essentially, this function + /// transfers the information from the SIMD function keywords and creates + /// new loop pragmas so that parameter information can be transferred to + /// the loop. + void insertDirectiveIntrinsics(Module& M, Function *Clone, Function &F, + VectorVariant &V, + BasicBlock *EntryBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the basic block indicating the begin of the SIMD loop. + void insertBeginRegion(Module& M, Function *Clone, Function &F, + VectorVariant &V, BasicBlock *EntryBlock); + + /// \brief Create the basic block indicating the end of the SIMD loop. + void insertEndRegion(Module& M, Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create a new vector alloca instruction for the return vector and + /// bitcast to the appropriate element type. + Instruction* createExpandedReturn(Function *F, BasicBlock *BB, + VectorType *ReturnType); + + /// \brief Return the position of the parameter in the function's parameter + /// list. 
int getParmIndexInFunction(Function *F, Value *Parm);
+
+  /// \brief Check to see if the function is simple enough that a loop does
+  /// not need to be inserted into the function.
+  bool isSimpleFunction(Function *Clone, VectorVariant &V,
+                        ReturnInst *Return);
+
+  /// \brief Inserts the if/else split and mask condition for masked SIMD
+  /// functions.
+  void insertSplitForMaskedVariant(Function *Clone, BasicBlock *LoopBlock,
+                                   BasicBlock *LoopExitBlock,
+                                   Instruction *Mask, PHINode *Phi);
+
+  /// \brief Utility function to insert instructions with other instructions
+  /// of the same kind.
+  void insertInstruction(Instruction *Inst, BasicBlock *BB);
+
+  /// \brief Utility function that generates instructions that calculate the
+  /// stride for a linear parameter.
+  Instruction* generateStrideForParameter(Function *Clone, Argument *Arg,
+                                          Instruction *ParmUser, int Stride,
+                                          PHINode *Phi);
+
+  /// \brief Utility function that returns true if Inst is a store of a vector
+  /// or linear parameter.
+  bool isVectorOrLinearParamStore(Function *Clone,
+                                  std::vector<VectorKind> &ParmKinds,
+                                  Instruction *Inst);
+
+  /// \brief Removes the original scalar alloca instructions that correspond
+  /// to a vector parameter before widening.
+  void removeScalarAllocasForVectorParams(
+      std::vector<ParmRef *> &VectorParmMap);
+
+  /// \brief Adds metadata to the conditional branch of the simd loop latch to
+  /// prevent loop unrolling.
+  void disableLoopUnrolling(BasicBlock *Latch);
+
+  /// \brief Check to see that the type of the gep used for a load instruction
+  /// is compatible with the type needed as the result of the load. Basically,
+  /// check the validity of the LLVM IR to make sure that proper pointer
+  /// dereferencing is done.
+  bool typesAreCompatibleForLoad(Type *GepType, Type *LoadType);
+
+  bool runOnModule(Module &M) override;
+
+  public:
+
+  static char ID;
+  VecClone();
+  void print(raw_ostream &OS, const Module * = nullptr) const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+}; // end pass class
+
+ModulePass *createVecClonePass();
+
+} // end llvm namespace
+
+#endif // LLVM_TRANSFORMS_UTILS_VECCLONE_H
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -79,6 +79,7 @@
   ScopedNoAliasAA.cpp
   ValueTracking.cpp
   VectorUtils.cpp
+  VectorVariant.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis
Index: lib/Analysis/VectorUtils.cpp
===================================================================
--- lib/Analysis/VectorUtils.cpp
+++ lib/Analysis/VectorUtils.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/VectorVariant.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Value.h"
@@ -488,3 +489,106 @@
   return Inst;
 }
+
+std::vector<Attribute> llvm::getVectorVariantAttributes(Function& F) {
+  std::vector<Attribute> RetVal;
+  AttributeSet Attributes = F.getAttributes().getFnAttributes();
+  AttributeSet::iterator ItA = Attributes.begin(0);
+  AttributeSet::iterator EndA = Attributes.end(0);
+  for (; ItA != EndA; ++ItA) {
+    if (!ItA->isStringAttribute())
+      continue;
+    StringRef AttributeKind = ItA->getKindAsString();
+    if (VectorVariant::isVectorVariant(AttributeKind))
+      RetVal.push_back(*ItA);
+  }
+  return RetVal;
+}
+
+Type* llvm::calcCharacteristicType(Function& F, VectorVariant& Variant) {
+  Type* ReturnType = F.getReturnType();
+  Type* CharacteristicDataType = NULL;
+
+  if (!ReturnType->isVoidTy())
+    CharacteristicDataType = ReturnType;
+
+  if (!CharacteristicDataType) {
+
+    std::vector<VectorKind>& ParmKinds = Variant.getParameters();
+    const Function::ArgumentListType& Args = F.getArgumentList();
+    Function::ArgumentListType::const_iterator ArgIt = Args.begin();
+    Function::ArgumentListType::const_iterator ArgEnd = Args.end();
+    std::vector<VectorKind>::iterator VKIt = ParmKinds.begin();
+
+    for (; ArgIt != ArgEnd; ++ArgIt, ++VKIt) {
+      if (VKIt->isVector()) {
+        CharacteristicDataType = (*ArgIt).getType();
+        break;
+      }
+    }
+  }
+
+  // TODO except Clang's ComplexType
+  if (!CharacteristicDataType || CharacteristicDataType->isStructTy()) {
+    CharacteristicDataType = Type::getInt32Ty(F.getContext());
+  }
+
+  // Promote char/short types to int for Xeon Phi.
+  CharacteristicDataType =
+      VectorVariant::promoteToSupportedType(CharacteristicDataType, Variant);
+
+  if (CharacteristicDataType->isPointerTy()) {
+    // For such cases as 'int* foo(int x)', where x is a non-vector type, the
+    // characteristic type at this point will be i32*. If we use the DataLayout
+    // to query the supported pointer size, then a promotion to i64* is
+    // incorrect because the mask element type will mismatch the element type
+    // of the characteristic type.
+    PointerType *PointerTy = cast<PointerType>(CharacteristicDataType);
+    CharacteristicDataType = PointerTy->getElementType();
+  }
+
+  return CharacteristicDataType;
+}
+
+void llvm::getFunctionsToVectorize(llvm::Module &M,
+                                   FunctionVariants& FuncVars) {
+  for (auto It = M.begin(), End = M.end(); It != End; ++It) {
+    Function& F = *It;
+    auto VariantAttributes = getVectorVariantAttributes(F);
+    if (VariantAttributes.empty())
+      continue;
+    FuncVars[&F] = DeclaredVariants();
+    DeclaredVariants& DeclaredFuncVariants = FuncVars[&F];
+    for (auto Attr : VariantAttributes)
+      DeclaredFuncVariants.push_back(Attr.getKindAsString());
+  }
+}
+
+template Constant *
+llvm::getConstantValue(Type *Ty, LLVMContext &Context, int Val);
+template Constant *
+llvm::getConstantValue(Type *Ty, LLVMContext &Context, float Val);
+template Constant *
+llvm::getConstantValue(Type *Ty, LLVMContext &Context, double Val);
+
+template <typename T>
+Constant *llvm::getConstantValue(Type *Ty, LLVMContext &Context, T Val) {
+  Constant *ConstVal = nullptr;
+
+  if (Ty->isIntegerTy(8))
+    ConstVal = ConstantInt::get(Type::getInt8Ty(Context), Val);
+  else if (Ty->isIntegerTy(16))
+    ConstVal = ConstantInt::get(Type::getInt16Ty(Context), Val);
+  else if (Ty->isIntegerTy(32))
+    ConstVal = ConstantInt::get(Type::getInt32Ty(Context), Val);
+  else if (Ty->isIntegerTy(64))
+    ConstVal = ConstantInt::get(Type::getInt64Ty(Context), Val);
+  else if (Ty->isFloatTy())
+    ConstVal = ConstantFP::get(Type::getFloatTy(Context), Val);
+  else if (Ty->isDoubleTy())
+    ConstVal = ConstantFP::get(Type::getDoubleTy(Context), Val);
+
+  assert(ConstVal && "Could not generate constant for type");
+
+  return ConstVal;
+}
Index: lib/Analysis/VectorVariant.cpp
===================================================================
--- lib/Analysis/VectorVariant.cpp
+++ lib/Analysis/VectorVariant.cpp
@@ -0,0 +1,149 @@
+//===--------- VectorVariant.cpp - Vector function ABI -*- C++ -*----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VectorVariant class and corresponding utilities. +/// VectorVariant objects are associated with a scalar function and are used +/// to generate new functions that can be vectorized. VectorVariants are +/// determined by inspecting the function attributes associated with the scalar +/// function. When a mangled function name is found in the attributes (indicated +/// as "_ZGV"), a VectorVariant object is created. The class and utilities +/// in this file follow the standards defined in the vector function ABI. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/Type.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +/// \brief Generate a vector variant by decoding the mangled string for the +/// variant contained in the original scalar function's attributes. For +/// example: "_ZGVxN4". The name mangling is defined in the vector function +/// ABI. Based on this string, the parameter kinds (uniform, linear, vector), +/// vector length, parameter alignment, and masking are determined. +VectorVariant::VectorVariant(StringRef FuncName) { + + assert(isVectorVariant(FuncName) && "invalid vector variant format"); + + std::stringstream SST(FuncName.drop_front(prefix().size())); + + // mandatory annotations + char EncodedISA; + SST.get(EncodedISA); + Isa = decodeISAClass(EncodedISA); + + char EncodedMask; + SST.get(EncodedMask); + Mask = decodeMask(EncodedMask); + SST >> Vlen; + + // optional parameter annotations + while (SST.peek() != '_') { + + char Kind; + int Stride = VectorKind::notAValue(); + int StrideSign = POSITIVE; + int Alignment = NOT_ALIGNED; + + // Get parameter kind + SST.get(Kind); + + // Default stride for linear is 1. If the stride for a parameter is 1, + // then the front-end will not encode it and we will not have set the + // correct stride below. + if (Kind == LINEAR_KIND) + Stride = 1; + + // Handle optional stride + if (SST.peek() == 'n') { + // Stride is negative + SST.ignore(1); + StrideSign = NEGATIVE; + } + + if (std::isdigit(SST.peek())) { + // Extract constant stride + SST >> Stride; + assert((Kind != STRIDE_KIND || Stride >= 0) && + "variable stride argument index cannot be negative"); + } + + Stride *= StrideSign; + // Handle optional alignment + if (SST.peek() == 'a') { + SST.ignore(1); + SST >> Alignment; + } + + VectorKind VecKind(Kind, Stride, Alignment); + Parameters.push_back(VecKind); + } + + if (Mask) { + // Masked variants will have an additional mask parameter + VectorKind VecKind(VECTOR_KIND, VectorKind::notAValue()); + Parameters.push_back(VecKind); + } +} + +/// \brief Determine the vector variant's vector length based on the +/// characteristic data type defined in the vector function ABI and target +/// vector register width. 
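+/// For example, a 32-bit float characteristic type on the YMM2 (AVX2) ISA
+/// class uses a 256-bit vector register, giving a vector length of
+/// 256 / 32 = 8.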
+unsigned int VectorVariant::calcVlen(ISAClass I, + Type* CharacteristicDataType) { + assert(CharacteristicDataType && + CharacteristicDataType->getPrimitiveSizeInBits() != 0 && + "expected characteristic data type to have a primitive size in bits"); + + unsigned int VectorRegisterSize = + maximumSizeofISAClassVectorRegister(I, CharacteristicDataType); + + return VectorRegisterSize / CharacteristicDataType->getPrimitiveSizeInBits(); +} + +/// \brief Determine the maximum vector register width based on the ISA classes +/// defined in the vector function ABI. +unsigned int VectorVariant::maximumSizeofISAClassVectorRegister(ISAClass I, + Type* Ty) +{ + assert((Ty->isIntegerTy() || Ty->isFloatTy() || Ty->isDoubleTy() || + Ty->isPointerTy()) && "unsupported type"); + + unsigned int VectorRegisterSize = 0; + + switch (I) { + case XMM: + VectorRegisterSize = 128; + break; + case YMM1: + if (Ty->isIntegerTy() || Ty->isPointerTy()) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case YMM2: + if (Ty->isIntegerTy(8)) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case ZMM: + VectorRegisterSize = 512; + break; + default: + llvm_unreachable("unknown isa class"); + return 0; + } + + assert(VectorRegisterSize != 0 && "unsupported ISA/type combination"); + return VectorRegisterSize; +} Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -38,6 +38,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Utils/VecClone.h" using namespace llvm; @@ -102,6 +103,10 @@ "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); +static cl::opt RunVecClone("enable-vec-clone", + cl::init(true), cl::Hidden, + cl::desc("Run Vector Function Cloning")); + static cl::opt EnableNonLTOGlobalsModRef( "enable-non-lto-gmr", cl::init(true), cl::Hidden, cl::desc( @@ -376,6 +381,10 @@ MPM.add(createBarrierNoopPass()); addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); + + if (RunVecClone) + MPM.add(createVecClonePass()); + return; } @@ -513,6 +522,9 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false)); + if (RunVecClone) + MPM.add(createVecClonePass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -44,6 +44,7 @@ UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + VecClone.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms Index: lib/Transforms/Utils/VecClone.cpp =================================================================== --- lib/Transforms/Utils/VecClone.cpp +++ lib/Transforms/Utils/VecClone.cpp @@ -0,0 +1,1553 @@ +//=------- VecClone.cpp - Vector function to loop transform -*- C++ -*-------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This pass inserts the body of a vector function inside a vector length +/// trip count scalar loop for functions that are declared SIMD. The pass +/// follows the Intel vector ABI requirements for name mangling encodings. +/// +// ===--------------------------------------------------------------------=== // + +// This pass is flexible enough to deal with two forms of LLVM IR, namely the +// difference between when Mem2Reg has been run and when Mem2Reg has not run. +// i.e., the pass needs to be able to make the loop transformation at all +// opt levels. +// +// When Mem2Reg has run: +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %add = add nsw i32 %x, %i +// ret i32 %add +// } +// +// When Mem2Reg has not run: +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %i.addr = alloca i32, align 4 +// %x.addr = alloca i32, align 4 +// store i32 %i, i32* %i.addr, align 4 +// store i32 %x, i32* %x.addr, align 4 +// %0 = load i32, i32* %x.addr, align 4 +// %1 = load i32, i32* %i.addr, align 4 +// %add = add nsw i32 %0, %1 +// ret i32 %add +// } +// +// When Mem2Reg has not been run (i.e., parameters have not been registerized), +// we end up with an alloca, store to alloca, and load from alloca sequence. +// When parameters have already been registerized, users of the parameter use +// the parameter directly and not through a load of a new SSA temp. For either +// case, this pass will expand the vector parameters/return to vector types, +// alloca new space for them on the stack, and do an initial store to the +// alloca. Linear and uniform parameters will be used directly, instead of +// through a load instruction. + +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/Transforms/Utils/VecClone.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include +#include + +#define SV_NAME "vec-clone" +#define DEBUG_TYPE "VecClone" + +using namespace llvm; + +VecClone::VecClone() : ModulePass(ID) { } + +void VecClone::getAnalysisUsage(AnalysisUsage &AU) const { + // Placeholder for any new pass dependencies. For now, none are needed. +} + +void VecClone::insertInstruction(Instruction *Inst, BasicBlock *BB) +{ + // This function inserts instructions in a way that groups like instructions + // together for debuggability/readability purposes. This was designed to make + // the entry basic block easier to read since this pass creates/modifies + // alloca, store, and bitcast instructions for each vector parameter and + // return. Thus, this function ensures all allocas are grouped together, all + // stores are grouped together, and so on. If the type of instruction passed + // in does not exist in the basic block, then it is added to the end of the + // basic block, just before the terminator instruction. 
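+  // Scan the block in reverse for the last instruction with the same opcode
+  // as Inst; if none is found, the anchor remains the block terminator and
+  // Inst is inserted before it.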
+ + BasicBlock::reverse_iterator BBIt = BB->rbegin(); + BasicBlock::reverse_iterator BBEnd = BB->rend(); + BasicBlock::iterator AnchorInstIt = BB->end(); + AnchorInstIt--; + Instruction *Anchor = &*AnchorInstIt; + + for (; BBIt != BBEnd; ++BBIt) { + if (Inst->getOpcode() == (&*BBIt)->getOpcode()) { + Anchor = &*BBIt; + break; + } + } + + if (isa(Anchor)) { + Inst->insertBefore(Anchor); + } else { + Inst->insertAfter(Anchor); + } +} + +bool VecClone::hasComplexType(Function *F) +{ + Function::ArgumentListType &ArgList = F->getArgumentList(); + Function::ArgumentListType::iterator ArgListIt = ArgList.begin(); + Function::ArgumentListType::iterator ArgListEnd = ArgList.end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + // Complex types for parameters/return come in as vector. + if (ArgListIt->getType()->isVectorTy()) { + return true; + } + } + + return false; +} + +Function* VecClone::cloneFunction(Function &F, VectorVariant &V) +{ + + DEBUG(dbgs() << "Cloning Function: " << F.getName() << "\n"); + DEBUG(F.dump()); + + FunctionType* OrigFunctionType = F.getFunctionType(); + Type *ReturnType = F.getReturnType(); + Type *CharacteristicType = calcCharacteristicType(F, V); + + // Expand return type to vector. + if (!ReturnType->isVoidTy()) + ReturnType = VectorType::get(ReturnType, V.getVlen()); + + std::vector ParmKinds = V.getParameters(); + SmallVector ParmTypes; + FunctionType::param_iterator ParmIt = OrigFunctionType->param_begin(); + FunctionType::param_iterator ParmEnd = OrigFunctionType->param_end(); + std::vector::iterator VKIt = ParmKinds.begin(); + for (; ParmIt != ParmEnd; ++ParmIt, ++VKIt) { + if (VKIt->isVector()) + ParmTypes.push_back(VectorType::get((*ParmIt)->getScalarType(), + V.getVlen())); + else + ParmTypes.push_back(*ParmIt); + } + + if (V.isMasked()) { + VectorType *MaskType = VectorType::get(CharacteristicType, V.getVlen()); + ParmTypes.push_back(MaskType); + } + + FunctionType* CloneFuncType = FunctionType::get(ReturnType, ParmTypes, + false); + + std::string VariantName = V.generateFunctionName(F.getName()); + Function* Clone = Function::Create(CloneFuncType, F.getLinkage(), + VariantName, F.getParent()); + + // Remove vector variant attributes from the original function. They are + // not needed for the cloned function and it prevents any attempts at + // trying to clone the function again in case the pass is called more than + // once. + AttrBuilder AB; + for (auto Attr : getVectorVariantAttributes(F)) { + AB.addAttribute(Attr); + } + AttributeSet AttrsToRemove = AttributeSet::get(F.getContext(), + AttributeSet::FunctionIndex, + AB); + + F.removeAttributes(AttributeSet::FunctionIndex, AttrsToRemove); + + // Copy all the attributes from the scalar function to its vector version + // except for the vector variant attributes. + Clone->copyAttributesFrom(&F); + + // Remove incompatible argument attributes (applied to the scalar argument, + // does not apply to its vector counterpart). 
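+  // Argument attribute indices are biased by one (index 0 refers to the
+  // return value), which is why Idx starts at 1 below.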
+ Function::arg_iterator ArgIt = Clone->arg_begin(); + Function::arg_iterator ArgEnd = Clone->arg_end(); + for (uint64_t Idx = 1; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + Type* ArgType = (*ArgIt).getType(); + AB = AttributeFuncs::typeIncompatible(ArgType); + AttributeSet AS = AttributeSet::get(Clone->getContext(), Idx, AB); + (*ArgIt).removeAttr(AS); + } + + ValueToValueMapTy Vmap; + ArgIt = F.arg_begin(); + ArgEnd = F.arg_end(); + Function::arg_iterator NewArgIt = Clone->arg_begin(); + for (; ArgIt != ArgEnd; ++ArgIt, ++NewArgIt) { + NewArgIt->setName(ArgIt->getName()); + Vmap[&*ArgIt] = &*NewArgIt; + } + + if (V.isMasked()) { + Argument &MaskArg = *NewArgIt; + MaskArg.setName("mask"); + } + + SmallVector Returns; + CloneFunctionInto(Clone, &F, Vmap, true, Returns); + + DEBUG(dbgs() << "After Cloning and Parameter/Return Expansion\n"); + DEBUG(Clone->dump()); + + return Clone; +} + +bool VecClone::isVectorOrLinearParamStore( + Function *Clone, + std::vector &ParmKinds, + Instruction *Inst) +{ + if (StoreInst *Store = dyn_cast(Inst)) { + Value *Op0 = Store->getOperand(0); + Function::ArgumentListType &ArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator ArgListIt = ArgList.begin(); + Function::ArgumentListType::iterator ArgListEnd = ArgList.end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + unsigned ParmIdx = ArgListIt->getArgNo(); + if (&*ArgListIt == Op0 && + (ParmKinds[ParmIdx].isVector() || ParmKinds[ParmIdx].isLinear())) { + return true; + } + } + } + + return false; +} + +BasicBlock* VecClone::splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock) +{ + + // EntryInsts contains all instructions that need to stay in the entry basic + // block. These instructions include allocas and stores involving vector and + // linear parameters to alloca. Linear parameter stores to alloca are kept in + // the entry block because there will be a load from this alloca in the loop + // for which we will apply the stride. Instructions involving uniform + // parameter stores to alloca should be sunk into the loop to maintain + // uniform behavior. All instructions involving private variables are also + // sunk into the loop. + + SmallVector EntryInsts; + std::vector ParmKinds = V.getParameters(); + BasicBlock::iterator BBIt = EntryBlock->begin(); + BasicBlock::iterator BBEnd = EntryBlock->end(); + + for (; BBIt != BBEnd; ++BBIt) { + if (isa(BBIt) || + isVectorOrLinearParamStore(Clone, ParmKinds, &*BBIt)) { + // If this is a store of a vector parameter, keep it in the entry block + // because it will be modified with the vector alloca reference. Since the + // parameter has already been expanded, this becomes a vector store (i.e., + // packing instruction) that we do not want to appear in the scalar loop. + // It is correct to leave linear parameter stores in the entry or move + // them to the scalar loop, but leaving them in the entry block prevents + // an additional store inside the loop. Uniform parameter stores must be + // moved to the loop body to behave as uniform. Consider the following: + // + // __declspec(vector(uniform(x))) + // int foo(int a, int x) { + // x++; + // return (a + x); + // } + // + // Assume x = 1 for the call to foo. This implies x = 2 for the vector + // add. e.g., a[0:VL-1] + <2, 2, 2, 2>. If the initial store of x to the + // stack is done in the entry block outside of the loop, then x will be + // incremented by one each time within the loop because the increment of + // x will reside in the loop. 
Therefore, if the store of x is sunk into + // the loop, the initial value of 1 will always be stored to a temp + // before the increment, resulting in the value of 2 always being computed + // in the scalar loop. + EntryInsts.push_back(&*BBIt); + } + } + + BasicBlock *LoopBlock = EntryBlock->splitBasicBlock(EntryBlock->begin(), + "simd.loop"); + + for (auto *Inst : EntryInsts) { + Inst->removeFromParent(); + Inst->insertBefore(EntryBlock->getTerminator()); + } + + DEBUG(dbgs() << "After Entry Block Split\n"); + DEBUG(Clone->dump()); + + return LoopBlock; +} + +BasicBlock* VecClone::splitLoopIntoReturn(Function *Clone, + BasicBlock *LoopBlock) +{ + + // Determine the basic block with the return. For simple cases, the 'ret' + // instruction will be part of the entry block. In this case, separate the + // 'ret' into a new basic block because we don't want this as part of the + // loop. For more complex cases, the 'ret' and corresponding instructions + // (i.e., load from auto variable) will already be in a separate basic block, + // so no need to split here. + + Instruction *SplitPt = LoopBlock->getTerminator(); + + if (ReturnInst *Return = dyn_cast(SplitPt)) { + + // If the return is from a preceeding load, make sure the load is also put + // in the return block. This is the old scalar load that will end up getting + // replaced with the vector return and will get cleaned up later. + + // Make sure this is not a void function before getting the return + // operand. + if (!Clone->getReturnType()->isVoidTy()) { + Value *RetOp = Return->getOperand(0); + Value::use_iterator UseIt = RetOp->use_begin(); + Value::use_iterator UseEnd = RetOp->use_end(); + + for (; UseIt != UseEnd; ++UseIt) { + LoadInst *RetLoad = dyn_cast(*UseIt); + if (RetLoad) { + SplitPt = RetLoad; + } + } + } + } + + Function::iterator ReturnBlockIt = Clone->end(); + BasicBlock *ReturnBlock; + if (dyn_cast(SplitPt) || dyn_cast(SplitPt)) { + ReturnBlock = LoopBlock->splitBasicBlock(SplitPt, "return"); + } else { + ReturnBlockIt = Clone->end(); + ReturnBlockIt--; + ReturnBlock = &*ReturnBlockIt; + } + + return ReturnBlock; +} + +void VecClone::updateReturnPredecessors(Function *Clone, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock) +{ + // Update the branches of the ReturnBlock predecessors to point back to + // LoopBlock if the index is less than VL. + + // First, collect the basic blocks to be updated since we don't want to update + // the CFG while iterating through it. + SmallVector BranchesToUpdate; + Function::iterator FI = Clone->begin(); + Function::iterator FE = Clone->end(); + for (; FI != FE; ++FI) { + + BasicBlock::iterator BBI = FI->begin(); + BasicBlock::iterator BBE = FI->end(); + + for (; BBI != BBE; ++BBI) { + + BranchInst *Branch = dyn_cast(BBI); + + if (Branch) { + unsigned NumSucc = Branch->getNumSuccessors(); + + for (unsigned I = 0; I < NumSucc; ++I) { + if (Branch->getSuccessor(I) == ReturnBlock) { + BranchesToUpdate.push_back(Branch); + } + } + } + } + } + + // Now, do the actual update. The code below handles both conditional and + // unconditional branches because we loop through all successors of the + // branch to see if any of them point to the ReturnBlock. 
+ for (unsigned I = 0; I < BranchesToUpdate.size(); ++I) { + unsigned int NumOps = BranchesToUpdate[I]->getNumSuccessors(); + for (unsigned Idx = 0; Idx < NumOps; ++Idx) { + BasicBlock *Successor = BranchesToUpdate[I]->getSuccessor(Idx); + if (Successor == ReturnBlock) { + BranchesToUpdate[I]->setOperand(Idx, LoopExitBlock); + } + } + } +} + +BasicBlock* VecClone::createLoopExit(Function *Clone, BasicBlock *ReturnBlock) +{ + BasicBlock *LoopExitBlock = BasicBlock::Create(Clone->getContext(), + "simd.loop.exit", + Clone, ReturnBlock); + + updateReturnPredecessors(Clone, LoopExitBlock, ReturnBlock); + return LoopExitBlock; +} + +PHINode* VecClone::createPhiAndBackedgeForLoop( + Function *Clone, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock, + int VectorLength) +{ + + // Create the phi node for the top of the loop block and add the back + // edge to the loop from the loop exit. + + PHINode *Phi = PHINode::Create(Type::getInt32Ty(Clone->getContext()), 2, + "index", &*LoopBlock->getFirstInsertionPt()); + + Constant *Inc = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 1); + Constant *IndInit = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), + 0); + + Instruction *Induction = BinaryOperator::CreateNUWAdd(Phi, Inc, "indvar", + LoopExitBlock); + + Constant *VL = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), + VectorLength); + + Instruction *VLCmp = new ICmpInst(*LoopExitBlock, CmpInst::ICMP_ULT, + Induction, VL, "vl.cond"); + + BranchInst::Create(LoopBlock, ReturnBlock, VLCmp, LoopExitBlock); + + Phi->addIncoming(IndInit, EntryBlock); + Phi->addIncoming(Induction, LoopExitBlock); + + DEBUG(dbgs() << "After Loop Insertion\n"); + DEBUG(Clone->dump()); + + return Phi; +} + +Instruction* VecClone::expandVectorParameters( + Function *Clone, + VectorVariant &V, + BasicBlock *EntryBlock, + std::vector& VectorParmMap) +{ + // For vector parameters, expand the existing alloca to a vector. Then, + // bitcast the vector and store this instruction in a map. The map is later + // used to insert the new instructions and to replace the old scalar memory + // references. If there are no parameters, then the function simply does not + // perform any expansion since we iterate over the function's arg list. + + Instruction *Mask = nullptr; + SmallVector StoresToInsert; + + Function::ArgumentListType &CloneArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator ArgIt = CloneArgList.begin(); + Function::ArgumentListType::iterator ArgEnd = CloneArgList.end(); + + for (; ArgIt != ArgEnd; ++ArgIt) { + + User::user_iterator UserIt = ArgIt->user_begin(); + User::user_iterator UserEnd = ArgIt->user_end(); + + VectorType *VecType = dyn_cast(ArgIt->getType()); + + if (VecType) { + + // Create a new vector alloca and bitcast to a pointer to the + // element type. The following is an example of what the cast + // should look like: + // + // %veccast = bitcast <2 x i32>* %vec_a.addr to i32* + // + // geps using the bitcast will appear in a scalar form instead + // of casting to an array or using vector. For example, + // + // %vecgep1 = getelementptr i32, i32* %veccast, i32 %index + // + // instead of: + // + // getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 1 + // + // We do this to put the geps in a more scalar form. + + Twine VarName = "vec." 
+ ArgIt->getName(); + AllocaInst *VecAlloca = new AllocaInst(VecType, VarName); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = + PointerType::get(VecType->getElementType(), + VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecParmCast = nullptr; + if (ArgIt->getNumUses() == 0 && V.isMasked()) { + Mask = new BitCastInst(VecAlloca, ElemTypePtr, "mask.cast"); + } else { + Twine CastName = "vec." + ArgIt->getName() + ".cast"; + VecParmCast = new BitCastInst(VecAlloca, ElemTypePtr, CastName); + insertInstruction(VecParmCast, EntryBlock); + } + + for (; UserIt != UserEnd; ++UserIt) { + + StoreInst *StoreUser = dyn_cast(*UserIt); + AllocaInst *Alloca = NULL; + ParmRef *PRef = new ParmRef(); + + if (StoreUser) { + + // For non-mask parameters, find the initial store of the parameter + // to an alloca instruction. Map this alloca to the vector bitcast + // created above so that we can update the old scalar references. + + Alloca = dyn_cast(UserIt->getOperand(1)); + PRef->VectorParm = Alloca; + } else { + // Since Mem2Reg has run, there is no existing scalar store for + // the parameter, but we must still pack (store) the expanded vector + // parameter to a new vector alloca. This store is created here and + // put in a container for later insertion. We cannot insert it here + // since this will be a new user of the parameter and we are still + // iterating over the original users of the parameter. This will + // invalidate the iterator. We also map the parameter directly to the + // vector bitcast so that we can later update any users of the + // parameter. + + Value *ArgValue = dyn_cast(ArgIt); + StoreInst *Store = new StoreInst(ArgValue, VecAlloca); + StoresToInsert.push_back(Store); + PRef->VectorParm = ArgValue; + } + + PRef->VectorParmCast = VecParmCast; + VectorParmMap.push_back(PRef); + } + } + } + + // Insert any necessary vector parameter stores here. This is needed for when + // there were no existing scalar stores that we can update to vector stores + // for the parameter. This is needed when Mem2Reg has registerized parameters. + // The stores are inserted after the allocas in the entry block. + for (auto *Inst : StoresToInsert) { + insertInstruction(Inst, EntryBlock); + } + + return Mask; +} + +Instruction* VecClone::createExpandedReturn(Function *Clone, + BasicBlock *EntryBlock, + VectorType *ReturnType) +{ + // Expand the return temp to a vector. + + Twine VarName = "vec.retval"; + VectorType *AllocaType = dyn_cast(Clone->getReturnType()); + + AllocaInst *VecAlloca = new AllocaInst(AllocaType, VarName); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = + PointerType::get(ReturnType->getElementType(), + VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecCast = new BitCastInst(VecAlloca, ElemTypePtr, "ret.cast"); + insertInstruction(VecCast, EntryBlock); + + return VecCast; +} + +Instruction* VecClone::expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& VectorParmMap) +{ + // Determine how the return is currently handled, since this will determine + // if a new vector alloca is required for it. For simple functions, an alloca + // may not have been created for the return value. The function may just + // simply return a value defined by some operation that now exists within the + // loop. If an alloca was generated already, then the return block will load + // from it and then return. 
Thus, we look for a return resulting from a load + // in the return block. If found, we have already expanded all alloca + // instructions to vector types and the old scalar references have already + // been replaced with them. In this case, we only need to pack the results + // from the vector alloca into a temp and return the temp. If a vector alloca + // was not generated for the return, we need to add one for it because we have + // a scalar reference in the loop that needs to be replaced. After creating + // the new vector alloca, replace the reference to it in the loop and then + // pack the results into a temp and return it. + // + // Example 1: // alloca not generated in entry block + // + // loop: + // ... // some set of instructions + // %add1 = add nsw i32 %1, %2 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // ret i32 %add1 + // + // + // Example 2: + // + // loop: + // ... // some set of instructions + // %vecgep1 = getelementptr <2 x i32>* %vec_ret, i32 0, i32 %index + // store i32 %add2, i32* %vecgep1 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // %7 = load i32, i32*, %retval // the original scalar alloca + // ret i32 %7 + // + + ReturnInst *FuncReturn = dyn_cast(ReturnBlock->getTerminator()); + assert(FuncReturn && "Expected ret instruction to terminate the return\ + basic block"); + + LoadInst *LoadFromAlloca = dyn_cast(FuncReturn->getOperand(0)); + + // We need to generate a vector alloca for the return vector. + // Two cases exist, here: + // + // 1) For simple functions, the return is a temp defined within the + // loop body and the temp is not loaded from an alloca, or the return is + // a constant. (obviously, also not loaded from an alloca) + // + // 2) The return temp traces back to an alloca. + // + // For both cases, generate a vector alloca so that we can later load from it + // and return the vector temp from the function. The alloca is used to load + // and store from so that the scalar loop contains load/store/gep + // instructions. This enables AVR construction to remain straightforward. + // E.g., we don't need to worry about figuring out how to represent + // insert/extract when building AVR nodes. This keeps consistent with how ICC + // is operating. + // + // Additionally, for case 1 we must generate a gep and store after the + // instruction that defines the original return temp, so that we can store + // the result into the proper index of the return vector. For case 2, we must + // go into the loop and replace the old scalar alloca reference with the one + // just created as vector. + + Instruction *VecReturn = NULL; + VectorType *ReturnType = dyn_cast(Clone->getReturnType()); + + if (!LoadFromAlloca) { + + // Case 1 + + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + Value *RetVal = FuncReturn->getReturnValue(); + Instruction *RetFromTemp = dyn_cast(RetVal); + + Instruction *InsertPt; + Value *ValToStore; + Instruction *Phi = &*LoopBlock->begin(); + + if (RetFromTemp) { + // If we're returning from an SSA temp, set the insert point to the + // definition of the temp. + InsertPt = RetFromTemp; + ValToStore = RetFromTemp; + } else { + // If we're returning a constant, then set the insert point to the loop + // phi. From here, a store to the vector using the constant is inserted. + InsertPt = Phi; + ValToStore = RetVal; + } + + // Generate a gep from the bitcast of the vector alloca used for the return + // vector. 
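+    // For a hypothetical i32 return value this produces IR along the lines
+    // of: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index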
+ Twine GepName = VecReturn->getName() + ".gep"; + GetElementPtrInst *VecGep = + GetElementPtrInst::Create(ReturnType->getElementType(), + VecReturn, Phi, GepName); + VecGep->insertAfter(InsertPt); + + // Store the constant or temp to the appropriate lane in the return vector. + StoreInst *VecStore = new StoreInst(ValToStore, VecGep); + VecStore->insertAfter(VecGep); + + } else { + + // Case 2 + + AllocaInst *Alloca = dyn_cast(LoadFromAlloca->getOperand(0)); + bool AllocaFound = false; + unsigned ParmIdx = 0; + + for (; ParmIdx < VectorParmMap.size(); ParmIdx++) { + Value *ParmVal = VectorParmMap[ParmIdx]->VectorParm; + if (ParmVal == Alloca) + AllocaFound = true; + } + + if (AllocaFound) { + // There's already a vector alloca created for the return, which is the + // same one used for the parameter. E.g., we're returning the updated + // parameter. + VecReturn = VectorParmMap[ParmIdx]->VectorParmCast; + } else { + // A new return vector is needed because we do not load the return value + // from an alloca. + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + ParmRef *PRef = new ParmRef(); + PRef->VectorParm = Alloca; + PRef->VectorParmCast = VecReturn; + VectorParmMap.push_back(PRef); + } + } + + return VecReturn; +} + +Instruction* VecClone::expandVectorParametersAndReturn( + Function *Clone, + VectorVariant &V, + Instruction **Mask, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& VectorParmMap) +{ + // If there are no parameters, then this function will do nothing and this + // is the expected behavior. + *Mask = expandVectorParameters(Clone, V, EntryBlock, VectorParmMap); + + // If the function returns void, then don't attempt to expand to vector. + Instruction *ExpandedReturn = ReturnBlock->getTerminator(); + if (!Clone->getReturnType()->isVoidTy()) { + ExpandedReturn = expandReturn(Clone, EntryBlock, LoopBlock, ReturnBlock, + VectorParmMap); + } + + // So, essentially what has been done to this point is the creation and + // insertion of the vector alloca instructions. Now, we insert the bitcasts of + // those instructions, which have been stored in the map. The insertion of the + // vector bitcast to element type pointer is done at the end of the EntryBlock + // to ensure that any initial stores of vector parameters have been done + // before the cast. + + std::vector::iterator MapIt; + for (auto MapIt : VectorParmMap) { + Instruction *ExpandedCast = MapIt->VectorParmCast; + if (!ExpandedCast->getParent()) { + insertInstruction(ExpandedCast, EntryBlock); + } + } + + // Insert the mask parameter store to alloca and bitcast if this is a masked + // variant. + if (*Mask) { + // Mask points to the bitcast of the alloca instruction to element type + // pointer. Insert the bitcast after all of the other bitcasts for vector + // parameters. + insertInstruction(*Mask, EntryBlock); + + Value *MaskVector = (*Mask)->getOperand(0); + + // MaskParm points to the function's mask parameter. + Function::ArgumentListType &ArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator MaskParm = ArgList.end(); + MaskParm--; + + // Find the last parameter store in the function entry block and insert the + // the store of the mask parameter after it. We do this just to make the + // LLVM IR easier to read. If there are no parameters, just insert the store + // before the terminator. For safety, if we cannot find a store, then insert + // this store after the last alloca. 
At this point, there will at least be + // an alloca for either a parameter or return. This code just ensures that + // the EntryBlock instructions are grouped by alloca, followed by store, + // followed by bitcast for readability reasons. + + StoreInst *MaskStore = new StoreInst(&*MaskParm, MaskVector); + insertInstruction(MaskStore, EntryBlock); + } + + DEBUG(dbgs() << "After Parameter/Return Expansion\n"); + DEBUG(Clone->dump()); + + return ExpandedReturn; +} + +bool VecClone::typesAreCompatibleForLoad(Type *GepType, Type *LoadType) +{ + // GepType will always be a pointer since this refers to an alloca for a + // vector. + PointerType *GepPtrTy = dyn_cast(GepType); + Type *LoadFromTy = GepPtrTy->getElementType(); + Type *LoadToTy = LoadType; + + // Dereferencing pointers in LLVM IR means that we have to have a load for + // each level of indirection. This means that we load from a gep and the + // resulting load value type is reduced by one level of indirection. For + // example, we load from a gep of i32* to a temp that has an i32 type. We + // cannot do multiple levels of dereferencing in a single load. For example, + // we cannot load from a gep of i32** to an i32. This requires two loads. + // + // Legal Case: GepType = i32**, LoadFromTy = i32*, + // LoadType = i32*, LoadToTy = i32* + // + // %vec.b.elem.2 = load i32*, i32** %vec.b.cast.gep1 + // + // In this case, since both are pointers, types will be considered equal by + // LLVM, so we must continue getting the element types of each pointer type + // until one is no longer a pointer type. Then do an equality check. + // + // Legal Case: GepType = i32*, LoadFromTy = i32, + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32* %vec.b.cast.gep1 + // + // Ready to compare as is + // + // Illegal Case: GepType = i32**, LoadFromTy = i32* + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32** %vec.b.cast.gep1 + // + // This case arises due to differences in the LLVM IR at -O0 and >= -O1. + // For >= -O1, Mem2Reg registerizes parameters and there are no alloca + // instructions created for function parameters. At -O0, vector parameters + // are expanded and we modify the existing alloca that was used for the scalar + // parameter. When there is no alloca for vector parameters, we must create + // one for them. Thus, we have introduced an additional level of indirection + // for users of parameters at >= -O1. This can become a problem for load + // instructions and results in this illegal case. This function helps to + // check that we are not attempting to do an extra level of indirection + // within the load instructions for elements of vector parameters in the + // simd loop. If an illegal case is encountered, an additional load is + // inserted to account for the extra level of indirection and any users are + // updated accordingly. 
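+  //
+  // When this check fails for a load user, updateScalarMemRefsWithVector
+  // (below) does not rewrite the load in place. Instead it materializes an
+  // element load from the gep and points the original load at it, adding the
+  // missing level of indirection. Using the illegal case above for
+  // illustration (names follow the "vec.<parm>.elem" pattern used below):
+  //
+  //   %vec.b.elem = load i32*, i32** %vec.b.cast.gep
+  //   %2 = load i32, i32* %vec.b.elem
+  //
+  // where the first load is the inserted fix-up and the second is the
+  // original user, now dereferencing only one level per load.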
+ + while (LoadFromTy->getTypeID() == Type::PointerTyID && + LoadToTy->getTypeID() == Type::PointerTyID) { + + PointerType *FromPtrTy = cast(LoadFromTy); + PointerType *ToPtrTy = cast(LoadToTy); + + LoadFromTy = FromPtrTy->getElementType(); + LoadToTy = ToPtrTy->getElementType(); + } + + if (LoadFromTy->getTypeID() == LoadToTy->getTypeID()) { + return true; + } + + return false; +} + +void VecClone::updateScalarMemRefsWithVector( + Function *Clone, + Function &F, + BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, + PHINode *Phi, + std::vector& VectorParmMap) +{ + // This function replaces the old scalar uses of a parameter with a reference + // to the new vector one. A gep is inserted using the vector bitcast created + // in the entry block and any uses of the parameter are replaced with this + // gep. The only users that will not be updated are those in the entry block + // that do the initial store to the vector alloca of the parameter. + + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + + SmallVector InstsToUpdate; + Value *Parm = VectorParmMapIt->VectorParm; + Instruction *Cast = VectorParmMapIt->VectorParmCast; + + for (User *U : Parm->users()) { + InstsToUpdate.push_back(dyn_cast(U)); + } + + for (unsigned I = 0; I < InstsToUpdate.size(); ++I) { + + Instruction *User = InstsToUpdate[I]; + if (!(dyn_cast(User) && User->getParent() == EntryBlock)) { + + BitCastInst *BitCast = dyn_cast(Cast); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + Twine GepName = BitCast->getName() + ".gep"; + GetElementPtrInst *VecGep = + GetElementPtrInst::Create(PointeeType, BitCast, Phi, GepName, + User); + + unsigned NumOps = User->getNumOperands(); + for (unsigned I = 0; I < NumOps; ++I) { + if (User->getOperand(I) == Parm) { + + bool TypesAreCompatible = false; + + if (isa(User)) { + TypesAreCompatible = + typesAreCompatibleForLoad(VecGep->getType(), User->getType()); + } + + if ((isa(User) && TypesAreCompatible) || + isa(User)) { + // If the user is a load/store and the dereferencing is legal, + // then just modify the load/store operand to use the gep. + User->setOperand(I, VecGep); + } else { + // Otherwise, we need to load the value from the gep first before + // using it. This effectively loads the particular element from + // the vector parameter. + Twine LoadName = "vec." + Parm->getName() + ".elem"; + LoadInst *ParmElemLoad = new LoadInst(VecGep, LoadName); + ParmElemLoad->insertAfter(VecGep); + User->setOperand(I, ParmElemLoad); + } + } + } + } else { + // The user is the parameter store to alloca in the entry block. Replace + // the old scalar alloca with the new vector one. + AllocaInst *VecAlloca = dyn_cast(Cast->getOperand(0)); + User->setOperand(1, VecAlloca); + } + } + } + + DEBUG(dbgs() << "After Alloca Replacement\n"); + DEBUG(Clone->dump()); +} + +Instruction* VecClone::generateStrideForParameter( + Function *Clone, + Argument *Arg, + Instruction *ParmUser, + int Stride, + PHINode *Phi) +{ + // Value returned as the last instruction needed to update the users of the + // old parameter reference. + Instruction *StrideInst = nullptr; + + Constant *StrideConst = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), Stride); + + Instruction *Mul = BinaryOperator::CreateMul(StrideConst, Phi, "stride.mul"); + + // Insert the stride related instructions after the user if the instruction + // involves a redefinition of the parameter. 
For example, a load from the + // parameter's associated alloca or a cast. For these situations, we want to + // apply the stride to this SSA temp. For other instructions, e.g., add, the + // instruction computing the stride must be inserted before the user. + + if (!isa(ParmUser)) { + Mul->insertBefore(ParmUser); + } else { + Mul->insertAfter(ParmUser); + } + + if (Arg->getType()->isPointerTy()) { + + // Linear updates to pointer parameters involves an address calculation, so + // use gep. To properly update linear pointers we only need to multiply the + // loop index and stride since gep is indexed starting at 0 from the base + // address passed to the vector function. + PointerType *ParmPtrType = dyn_cast(Arg->getType()); + + // The base address used for linear gep computations. + Value *BaseAddr = nullptr; + StringRef RefName; + + if (LoadInst *ParmLoad = dyn_cast(ParmUser)) { + // We are loading from the alloca of the pointer parameter (no Mem2Reg) + // i.e., loading a pointer to an SSA temp. + BaseAddr = ParmUser; + RefName = ParmLoad->getOperand(0)->getName(); + } else { + // The user is using the pointer parameter directly. + BaseAddr = Arg; + RefName = BaseAddr->getName(); + } + + // Mul is always generated as i32 since it is calculated using the i32 loop + // phi that is inserted by this pass. No cast on Mul is necessary because + // gep can use a base address of one type with an index of another type. + Twine GepName = RefName + ".gep"; + GetElementPtrInst *LinearParmGep = + GetElementPtrInst::Create(ParmPtrType->getElementType(), + BaseAddr, Mul, GepName); + + LinearParmGep->insertAfter(Mul); + StrideInst = LinearParmGep; + } else { + // For linear values, a mul/add sequence is needed to generate the correct + // value. i.e., val = linear_var * stride + loop_index; + // + // Also, Mul above is generated as i32 because the phi type is always i32. + // However, ParmUser may be another integer type, so we must convert i32 to + // i8/i16/i64 when the user is not i32. + + if (ParmUser->getType()->getIntegerBitWidth() != + Mul->getType()->getIntegerBitWidth()) { + + Instruction *MulConv = + CastInst::CreateIntegerCast(Mul, ParmUser->getType(), true, + "stride.cast"); + MulConv->insertAfter(Mul); + Mul = MulConv; + } + + // Generate the instruction that computes the stride. + BinaryOperator *Add; + StringRef TempName = "stride.add"; + if (isa(ParmUser)) { + // The user of the parameter is an instruction that results in a + // redefinition of it. e.g., a load from an alloca (no Mem2Reg) or a cast + // instruction. In either case, the stride needs to be applied to this + // temp. + Add = BinaryOperator::CreateAdd(ParmUser, Mul, TempName); + } else { + // Otherwise, the user is an instruction that does not redefine the temp, + // such as an add instruction. For these cases, the stride must be + // computed before the user and the reference to the parameter must be + // replaced with this instruction. + Add = BinaryOperator::CreateAdd(Arg, Mul, TempName); + } + + Add->insertAfter(Mul); + StrideInst = Add; + } + + return StrideInst; +} + +void VecClone::updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi) +{ + // Add stride to parameters marked as linear. This is done by finding all + // users of the scalar alloca associated with the parameter. The user should + // be a load from this alloca to a temp. The stride is then added to this temp + // and its uses are replaced with the new temp. 
Or, if Mem2Reg eliminates the + // alloca/load, the parameter is used directly and this use is updated with + // the stride. + + Function::ArgumentListType &ArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator ArgListIt = ArgList.begin(); + Function::ArgumentListType::iterator ArgListEnd = ArgList.end(); + std::vector ParmKinds = V.getParameters(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + + User::user_iterator ArgUserIt = ArgListIt->user_begin(); + User::user_iterator ArgUserEnd = ArgListIt->user_end(); + unsigned ParmIdx = ArgListIt->getArgNo(); + SmallVector LinearParmUsers; + + if (ParmKinds[ParmIdx].isLinear()) { + + int Stride = ParmKinds[ParmIdx].getStride(); + + for (; ArgUserIt != ArgUserEnd; ++ArgUserIt) { + + // Collect all uses of the parameter so that they can later be used to + // apply the stride. + Instruction *ParmUser = dyn_cast(*ArgUserIt); + if (StoreInst *ParmStore = dyn_cast(ParmUser)) { + + // This code traces the store of the parameter to its associated + // alloca. Then, we look for a load from that alloca to a temp. This + // is the value we need to add the stride to. This is for when + // Mem2Reg has not been run. + AllocaInst *Alloca = dyn_cast(ArgUserIt->getOperand(1)); + + if (Alloca) { + for (auto *AU : Alloca->users()) { + + LoadInst *ParmLoad = dyn_cast(AU); + + if (ParmLoad) { + // The parameter is being loaded from an alloca to a new SSA + // temp. We must replace the users of this load with an + // instruction that adds the result of this load with the + // stride. + LinearParmUsers.push_back(ParmLoad); + } + } + } else { + // Mem2Reg has run, so the parameter is directly referenced in the + // store instruction. + LinearParmUsers.push_back(ParmStore); + } + } else { + // Mem2Reg has registerized the parameters, so users of it will use + // it directly, and not through a load of the parameter. + LinearParmUsers.push_back(ParmUser); + } + + for (unsigned I = 0; I < LinearParmUsers.size(); I++) { + // For each user of parameter: + + // We must deal with two cases here, based on whether Mem2Reg has been + // run. + // + // Example: + // + // __declspec(vector(linear(i:1),uniform(x),vectorlength(4))) + // extern int foo(int i, int x) { + // return (x + i); + // } + // + // 1) We are loading the parameter from an alloca and the SSA temp as + // as a result of the load is what we need to add the stride to. + // Then, any users of that temp must be replaced. The only load + // instructions put in the collection above are guaranteed to be + // associated with the parameter's alloca. Thus, we only need to + // check to see if a load is in the map to know what to do. 
+ // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 <--- %i + // %add = add nsw i32 %0, %1 <--- replace %1 with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %1, %stride.mul <--- stride + // %add = add nsw i32 %0, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // 2) The user uses the parameter directly, and so we must apply the + // stride directly to the parameter. Any users of the parameter + // must then be updated. + // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %add = add nsw i32 %x, %i <-- direct usage of %i + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %i, %stride.mul <--- stride + // %add = add nsw i32 %x, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + + Instruction *StrideInst = + generateStrideForParameter(Clone, &*ArgListIt, LinearParmUsers[I], + Stride, Phi); + + SmallVector InstsToUpdate; + Value *ParmUser; + + if (isa(LinearParmUsers[I])) { + // Case 1 + ParmUser = LinearParmUsers[I]; + User::user_iterator StrideUserIt = LinearParmUsers[I]->user_begin(); + User::user_iterator StrideUserEnd = LinearParmUsers[I]->user_end(); + + // Find the users of the redefinition of the parameter so that we + // can apply the stride to those instructions. + for (; StrideUserIt != StrideUserEnd; ++StrideUserIt) { + + Instruction *StrideUser = dyn_cast(*StrideUserIt); + if (StrideUser != StrideInst) { + // We've already inserted the stride which is now also a user of + // the parameter, so don't update that instruction. Otherwise, + // we'll create a self reference. Hence, why we don't use + // replaceAllUsesWith(). + InstsToUpdate.push_back(StrideUser); + } + } + } else { + // Case 2 + ParmUser = &*ArgListIt; + InstsToUpdate.push_back(LinearParmUsers[I]); + } + + // Replace the old references to the parameter with the instruction + // that applies the stride. 
+ for (unsigned J = 0; J < InstsToUpdate.size(); ++J) { + unsigned NumOps = InstsToUpdate[J]->getNumOperands(); + for (unsigned K = 0; K < NumOps; ++K) { + if (InstsToUpdate[J]->getOperand(K) == ParmUser) { + InstsToUpdate[J]->setOperand(K, StrideInst); + } + } + } + } + } + } + } + + DEBUG(dbgs() << "After Linear Updates\n"); + DEBUG(Clone->dump()); +} + +void VecClone::updateReturnBlockInstructions( + Function *Clone, + BasicBlock *ReturnBlock, + Instruction *ExpandedReturn) +{ + // If the vector function returns void, then there is no need to do any + // packing. The only instruction in the ReturnBlock is 'ret void', so + // we can just leave this instruction and we're done. + if (Clone->getReturnType()->isVoidTy()) return; + + // Collect all instructions in the return basic block. They will be removed. + SmallVector InstToRemove; + BasicBlock::iterator InstIt = ReturnBlock->begin(); + BasicBlock::iterator InstEnd = ReturnBlock->end(); + + for (; InstIt != InstEnd; ++InstIt) { + InstToRemove.push_back(&*InstIt); + } + + // Remove all instructions from the return block. These will be replaced + // with the instructions necessary to return a vector temp. The verifier + // will complain if we remove the definitions of users first, so remove + // instructions from the bottom up. + for (int I = InstToRemove.size() - 1; I >= 0; I--) { + InstToRemove[I]->eraseFromParent(); + } + + // Pack up the elements into a vector temp and return it. If the return + // vector was bitcast to a pointer to the element type, we must bitcast to + // vector before returning. + Instruction *Return; + if (dyn_cast(ExpandedReturn)) { + // Operand 0 is the actual alloc reference in the bitcast. + AllocaInst *Alloca = dyn_cast(ExpandedReturn->getOperand(0)); + PointerType *PtrVecType = + PointerType::get(Clone->getReturnType(), + Alloca->getType()->getAddressSpace()); + Twine CastName = "vec." + ExpandedReturn->getName(); + BitCastInst *BitCast = new BitCastInst(ExpandedReturn, PtrVecType, + CastName, ReturnBlock); + Return = BitCast; + } else { + Return = ExpandedReturn; + } + + LoadInst *VecReturn = new LoadInst(Return, "vec.ret", ReturnBlock); + ReturnInst::Create(Clone->getContext(), VecReturn, ReturnBlock); + + DEBUG(dbgs() << "After Return Block Update\n"); + DEBUG(Clone->dump()); +} + +int VecClone::getParmIndexInFunction(Function *F, Value *Parm) +{ + Function::arg_iterator ArgIt = F->arg_begin(); + Function::arg_iterator ArgEnd = F->arg_end(); + for (unsigned Idx = 0; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + if (Parm == &*ArgIt) return Idx; + } + + return -1; +} + +bool VecClone::isSimpleFunction(Function *Clone, VectorVariant &V, + ReturnInst *ReturnOnly) +{ + // For really simple functions, there is no need to go through the process + // of inserting a loop. + + // Example: + // + // void foo(void) { + // return; + // } + // + // No need to insert a loop for this case since it's basically a no-op. Just + // clone the function and return. It's possible that we could have some code + // inside of a vector function that modifies global memory. Let that case go + // through. 
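+  //
+  // The void_foo.ll test added with this patch exercises this early exit:
+  // the cloned variant's body is left as a bare "ret void" and no simd loop
+  // is inserted for it.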
+ if (ReturnOnly && Clone->getReturnType()->isVoidTy()) { + return true; + } + + return false; +} + +void VecClone::insertSplitForMaskedVariant(Function *Clone, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi) +{ + BasicBlock *LoopThenBlock = + LoopBlock->splitBasicBlock(LoopBlock->getFirstNonPHI(), + "simd.loop.then"); + + BasicBlock *LoopElseBlock = BasicBlock::Create(Clone->getContext(), + "simd.loop.else", + Clone, LoopExitBlock); + + BranchInst::Create(LoopExitBlock, LoopElseBlock); + + BitCastInst *BitCast = dyn_cast(Mask); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + GetElementPtrInst *MaskGep = + GetElementPtrInst::Create(PointeeType, Mask, Phi, "mask.gep", + LoopBlock->getTerminator()); + + Twine LoadName = "mask.parm"; + LoadInst *MaskLoad = new LoadInst(MaskGep, LoadName, + LoopBlock->getTerminator()); + + Type *CompareTy = MaskLoad->getType(); + Instruction *MaskCmp; + Constant* Zero; + + // Generate the compare instruction to see if the mask bit is on. In ICC, we + // use the movemask intrinsic which takes both float/int mask registers and + // converts to an integer scalar value, one bit representing each element. + // AVR construction will be complicated if this intrinsic is introduced here, + // so the current solution is to just generate either an integer or floating + // point compare instruction for now. This may change anyway if we decide to + // go to a vector of i1 values for the mask. I suppose this would be one + // positive reason to use vector of i1. + if (CompareTy->isIntegerTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0); + MaskCmp = new ICmpInst(LoopBlock->getTerminator(), CmpInst::ICMP_NE, + MaskLoad, Zero, "mask.cond"); + } else if (CompareTy->isFloatingPointTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0.0); + MaskCmp = new FCmpInst(LoopBlock->getTerminator(), CmpInst::FCMP_UNE, + MaskLoad, Zero, "mask.cond"); + } else { + assert(0 && "Unsupported mask compare"); + } + + TerminatorInst *Term = LoopBlock->getTerminator(); + Term->eraseFromParent(); + BranchInst::Create(LoopThenBlock, LoopElseBlock, MaskCmp, LoopBlock); + + DEBUG(dbgs() << "After Split Insertion For Masked Variant\n"); + DEBUG(Clone->dump()); +} + +void VecClone::removeScalarAllocasForVectorParams( + std::vector &VectorParmMap) +{ + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + Value *Parm = VectorParmMapIt->VectorParm; + if (AllocaInst *ScalarAlloca = dyn_cast(Parm)) { + ScalarAlloca->eraseFromParent(); + } + } +} + +void VecClone::disableLoopUnrolling(BasicBlock *Latch) +{ + // Set disable unroll metadata on the conditional branch of the loop latch + // for the simd loop. The following is an example of what the loop latch + // and Metadata will look like. The !llvm.loop marks the beginning of the + // loop Metadata and is always placed on the terminator of the loop latch. + // (i.e., simd.loop.exit in this case). According to LLVM documentation, to + // properly set the loop Metadata, the 1st operand of !16 must be a self- + // reference to avoid some type of Metadata merging conflicts that have + // apparently arisen in the past. This is part of LLVM history that I do not + // know. Also, according to LLVM documentation, any Metadata nodes referring + // to themselves are marked as distinct. 
As such, all Metadata corresponding + // to a loop belongs to that loop alone and no sharing of Metadata can be + // done across different loops. + // + // simd.loop.exit: ; preds = %simd.loop, %if.else, %if.then + // %indvar = add nuw i32 %index, 1 + // %vl.cond = icmp ult i32 %indvar, 2 + // br i1 %vl.cond, label %simd.loop, label %simd.end.region, !llvm.loop !16 + // + // !16 = distinct !{!16, !17} + // !17 = !{!"llvm.loop.unroll.disable"} + + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + + // Add unroll(disable) metadata to disable future unrolling. + LLVMContext &Context = Latch->getContext(); + SmallVector DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + Latch->getTerminator()->setMetadata("llvm.loop", NewLoopID); +} + +bool VecClone::runOnModule(Module &M) { + + DEBUG(dbgs() << "\nExecuting SIMD Function Cloning ...\n\n"); + + FunctionVariants FunctionsToVectorize; + getFunctionsToVectorize(M, FunctionsToVectorize); + if (FunctionsToVectorize.empty()) { + // No vector variants for this function exist. + return false; + } + + for (auto& pair : FunctionsToVectorize) { + + Function& F = *pair.first; + DeclaredVariants &DeclaredVariants = pair.second; + + for (auto& DeclaredVariant : DeclaredVariants) { + + VectorVariant Variant(DeclaredVariant); + + // Clone the original function. + DEBUG(dbgs() << "Before SIMD Function Cloning\n"); + DEBUG(F.dump()); + Function *Clone = cloneFunction(F, Variant); + Function::iterator EntryBlock = Clone->begin(); + BasicBlock::iterator FirstInst = EntryBlock->begin(); + ReturnInst *ReturnOnly = dyn_cast(FirstInst); + + if (isSimpleFunction(Clone, Variant, ReturnOnly)) { + continue; + } + + BasicBlock *LoopBlock = splitEntryIntoLoop(Clone, Variant, &*EntryBlock); + BasicBlock *ReturnBlock = splitLoopIntoReturn(Clone, LoopBlock); + BasicBlock *LoopExitBlock = createLoopExit(Clone, ReturnBlock); + PHINode *Phi = createPhiAndBackedgeForLoop(Clone, &*EntryBlock, + LoopBlock, LoopExitBlock, + ReturnBlock, + Variant.getVlen()); + + // At this point, we've gathered some parameter information and have + // restructured the function into an entry block, a set of blocks + // forming the loop, a loop exit block, and a return block. Now, + // we can go through and update instructions since we know what + // is part of the loop. + + // VectorParmMap contains the mapping of the parameter to the bitcast + // instruction that casts the vector alloca for vector parameters + // to a scalar pointer for use in the simd loop. When parameters are + // registerized, the Value* in the map correponds directly to the + // function parameter. When parameters are not registerized, then the + // Value* in the map is the original scalar alloca before expansion. + // Later, users of the parameter, either directly or through the alloca, + // are replaced with a gep using the bitcast of the vector alloca for the + // parameter and the current loop induction variable value. + // + // IMPORTANT NOTE: std::vector was used here because later we emit LLVM + // instructions using the members of ParmRef, and these instructions + // should be ordered consistently for easier testability. 
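+      //
+      // As an illustrative example (names from the two_vec_sum.ll test in
+      // this patch, which was generated at -O0 and therefore still has
+      // parameter allocas), the map conceptually ends up holding:
+      //
+      //   VectorParm           VectorParmCast
+      //   %i.addr (alloca)     %vec.i.cast = bitcast <4 x i32>* %vec.i to i32*
+      //   %j.addr (alloca)     %vec.j.cast = bitcast <4 x i32>* %vec.j to i32*
+      //
+      // so the rewrites below can form geps such as
+      //   %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index
+      // inside the simd loop.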
+ + std::vector VectorParmMap; + + // Create a new vector alloca instruction for all vector parameters and + // return. For parameters, replace the initial store to the old alloca + // with the vector one. Users of the old alloca within the loop will be + // replaced with a gep using this address along with the proper loop + // index. + + Instruction *Mask = NULL; + Instruction *ExpandedReturn = + expandVectorParametersAndReturn(Clone, Variant, &Mask, &*EntryBlock, + LoopBlock, ReturnBlock, + VectorParmMap); + updateScalarMemRefsWithVector(Clone, F, &*EntryBlock, ReturnBlock, Phi, + VectorParmMap); + + // Update any linear variables with the appropriate stride. This function + // will insert a mul/add sequence before the use of the parameter. For + // linear pointer parameters, the stride calculation is just a mul + // instruction using the loop induction var and the stride value on the + // parameter. This mul instruction is then used as the index of the gep + // that will be inserted before the next use of the parameter. The + // function also updates the users of the parameter with the new + // calculation involving the stride. + updateLinearReferences(Clone, F, Variant, Phi); + + // Remove the old scalar instructions associated with the return and + // replace with packing instructions. + updateReturnBlockInstructions(Clone, ReturnBlock, ExpandedReturn); + + // Remove the old scalar allocas associated with vector parameters since + // these have now been replaced with vector ones. + removeScalarAllocasForVectorParams(VectorParmMap); + VectorParmMap.clear(); + + // If this is the masked vector variant, insert the mask condition and + // if/else blocks. + if (Variant.isMasked()) { + insertSplitForMaskedVariant(Clone, LoopBlock, LoopExitBlock, Mask, Phi); + } + + DEBUG(dbgs() << "After SIMD Function Cloning\n"); + DEBUG(Clone->dump()); + + // Disable unrolling from kicking in on the simd loop. + disableLoopUnrolling(LoopExitBlock); + + } // End of function cloning for the variant + } // End of function cloning for all variants + + return true; // LLVM IR has been modified +} + +void VecClone::print(raw_ostream &OS, const Module *M) const { + // TODO +} + +ModulePass *llvm::createVecClonePass() { + return new llvm::VecClone(); +} + +char VecClone::ID = 0; + +static const char lv_name[] = "VecClone"; +INITIALIZE_PASS_BEGIN(VecClone, SV_NAME, lv_name, + false /* modifies CFG */, false /* transform pass */) +INITIALIZE_PASS_END(VecClone, SV_NAME, lv_name, + false /* modififies CFG */, false /* transform pass */) Index: test/Transforms/VecClone/broadcast.ll =================================================================== --- test/Transforms/VecClone/broadcast.ll +++ test/Transforms/VecClone/broadcast.ll @@ -0,0 +1,19 @@ +; Check broadcast of a constant. The store of the constant should be moved inside of the loop. 
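+; This exercises "case 1" of VecClone::expandReturn, where the returned value
+; is a constant rather than a load from an alloca: the pass creates the vector
+; return alloca and stores the constant through a per-iteration gep, which is
+; why the store lands inside simd.loop.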
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4_foo +; CHECK: simd.loop: +; CHECK: store i32 99, i32* %ret.cast.gep + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo() #0 { +entry: + ret i32 99 +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM4_foo" "_ZGVbN4_foo" "_ZGVcM8_foo" "_ZGVcN8_foo" "_ZGVdM8_foo" "_ZGVdN8_foo" "_ZGVeM16_foo" "_ZGVeN16_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/convert_linear.ll =================================================================== --- test/Transforms/VecClone/convert_linear.ll +++ test/Transforms/VecClone/convert_linear.ll @@ -0,0 +1,32 @@ +; Check handling of upconverting a linear (variable %i) to ensure stride calculation +; is inserted correctly and the old convert (sext) uses the stride instead of the old +; reference to %i. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN2vl_foo +; CHECK: simd.loop: +; CHECK: %0 = load i32, i32* %i.addr +; CHECK-NEXT: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %0, %stride.mul +; CHECK-NEXT: %conv = sext i32 %stride.add to i64 + +; ModuleID = 'convert.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i64 @foo(i64 %x, i32 %i) #0 { +entry: + %x.addr = alloca i64, align 8 + %i.addr = alloca i32, align 4 + store i64 %x, i64* %x.addr, align 8 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %x.addr, align 8 + %add = add nsw i64 %conv, %1 + ret i64 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM2vl_foo" "_ZGVbN2vl_foo" "_ZGVcM4vl_foo" "_ZGVcN4vl_foo" "_ZGVdM4vl_foo" "_ZGVdN4vl_foo" "_ZGVeM8vl_foo" "_ZGVeN8vl_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/external_array.ll =================================================================== --- test/Transforms/VecClone/external_array.ll +++ test/Transforms/VecClone/external_array.ll @@ -0,0 +1,35 @@ +; Check to see that we are applying the correct updated linear index for an external array access gep. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4ul_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %idxprom = sext i32 %stride.add to i64 +; CHECK: %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom +; CHECK: store i32 %0, i32* %arrayidx + +; ModuleID = 'external_array_assign.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@ext_a = common global [128 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @foo(i32 %x, i32 %i) #0 { +entry: + %x.addr = alloca i32, align 4 + %i.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom + store i32 %0, i32* %arrayidx, align 4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable "_ZGVbM4ul_foo" "_ZGVbN4ul_foo" "_ZGVcM8ul_foo" "_ZGVcN8ul_foo" "_ZGVdM8ul_foo" "_ZGVdN8ul_foo" "_ZGVeM16ul_foo" "_ZGVeN16ul_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear.ll =================================================================== --- test/Transforms/VecClone/linear.ll +++ test/Transforms/VecClone/linear.ll @@ -0,0 +1,29 @@ +; Check to see that the linear parameter i is updated with the correct stride, indicated by a mul/add instruction sequence after the load. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %add = add nsw i32 %0, %stride.add + +; ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %i.addr = alloca i32, align 4 + %x.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %x, i32* %x.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM4lu_foo" "_ZGVbN4lu_foo" "_ZGVcM8lu_foo" "_ZGVcN8lu_foo" "_ZGVdM8lu_foo" "_ZGVdN8lu_foo" "_ZGVeM16lu_foo" "_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear_mem2reg.ll =================================================================== --- test/Transforms/VecClone/linear_mem2reg.ll +++ test/Transforms/VecClone/linear_mem2reg.ll @@ -0,0 +1,22 @@ +; Check to see that the linear parameter i is updated with the correct stride when Mem2Reg is on. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %i, %stride.mul +; CHECK-NEXT: %add = add nsw i32 %x, %stride.add + +;ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %add = add nsw i32 %x, %i + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM4lu_foo" "_ZGVbN4lu_foo" "_ZGVcM8lu_foo" "_ZGVcN8lu_foo" "_ZGVdM8lu_foo" "_ZGVdN8lu_foo" "_ZGVeM16lu_foo" "_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/struct_linear_ptr.ll =================================================================== --- test/Transforms/VecClone/struct_linear_ptr.ll +++ test/Transforms/VecClone/struct_linear_ptr.ll @@ -0,0 +1,40 @@ +; Test that the stride is being applied correctly to struct field accesses. 
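+; For linear pointer parameters the stride is applied through a gep rather
+; than a mul/add on the value (see VecClone::generateStrideForParameter): the
+; loop index is multiplied by the stride and the product is used as the gep
+; index off the loaded base pointer, which is what the %s.addr.gep checks
+; below verify.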
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4l_foo +; CHECK: simd.loop: +; CHECK: %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %0, i32 %stride.mul{{.*}} +; CHECK: %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 0 +; CHECK: %1 = load float, float* %field1, align 8 +; CHECK: %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %2, i32 %stride.mul{{.*}} +; CHECK: %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 4 +; CHECK: %3 = load float, float* %field5, align 8 +; CHECK: %add = fadd float %1, %3 + +; ModuleID = 'struct_linear_ptr.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.my_struct = type { float, i8, i32, i16, float, i64 } + +; Function Attrs: nounwind uwtable +define float @foo(%struct.my_struct* %s) #0 { +entry: + %s.addr = alloca %struct.my_struct*, align 8 + store %struct.my_struct* %s, %struct.my_struct** %s.addr, align 8 + %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %0, i32 0, i32 0 + %1 = load float, float* %field1, align 8 + %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %2, i32 0, i32 4 + %3 = load float, float* %field5, align 8 + %add = fadd float %1, %3 + ret float %add +} + +attributes #0 = { norecurse nounwind readonly uwtable "_ZGVbM4l_foo" "_ZGVbN4l_foo" "_ZGVcM8l_foo" "_ZGVcN8l_foo" "_ZGVdM8l_foo" "_ZGVdN8l_foo" "_ZGVeM16l_foo" "_ZGVeN16l_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum.ll +++ test/Transforms/VecClone/two_vec_sum.ll @@ -0,0 +1,59 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the non-masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
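+; Reading the variant name being checked: in _ZGVbN4vv_vec_sum, "N" marks the
+; non-masked variant (the masked counterpart, _ZGVbM4vv_vec_sum, is covered in
+; two_vec_sum_mask.ll), "4" is the vector length that becomes the simd loop
+; trip count, and "vv" indicates that both parameters are vector parameters.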
+ +; CHECK-LABEL: <4 x i32> @_ZGVbN4vv_vec_sum(<4 x i32> %i, <4 x i32> %j) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "_ZGVbM4vv_vec_sum" "_ZGVbN4vv_vec_sum" "_ZGVcM8vv_vec_sum" "_ZGVcN8vv_vec_sum" "_ZGVdM8vv_vec_sum" "_ZGVdN8vv_vec_sum" "_ZGVeM16vv_vec_sum" "_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mask.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mask.ll +++ test/Transforms/VecClone/two_vec_sum_mask.ll @@ -0,0 +1,71 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
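+; For the masked variant the pass splits the loop body: the mask element for
+; the current lane is loaded through a gep on the mask's cast, compared
+; against zero, and the original body only runs on the simd.loop.then path,
+; while simd.loop.else falls through to simd.loop.exit.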
+ +; Begin non-masked variant checking + +; CHECK-LABEL: <4 x i32> @_ZGVbM4vv_vec_sum(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.mask = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i, align 4 +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j, align 4 +; CHECK-NEXT: store <4 x i32> %mask, <4 x i32>* %vec.mask +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: %mask.cast = bitcast <4 x i32>* %vec.mask to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %mask.gep = getelementptr i32, i32* %mask.cast, i32 %index +; CHECK-NEXT: %mask.parm = load i32, i32* %mask.gep +; CHECK-NEXT: %mask.cond = icmp ne i32 %mask.parm, 0 +; CHECK-NEXT: br i1 %mask.cond, label %simd.loop.then, label %simd.loop.else + +; CHECK: simd.loop.then: +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.else: +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "_ZGVbM4vv_vec_sum" "_ZGVbN4vv_vec_sum" "_ZGVcM8vv_vec_sum" "_ZGVcN8vv_vec_sum" "_ZGVdM8vv_vec_sum" "_ZGVdN8vv_vec_sum" "_ZGVeM16vv_vec_sum" "_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mem2reg.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mem2reg.ll +++ test/Transforms/VecClone/two_vec_sum_mem2reg.ll @@ -0,0 +1,31 @@ +; Check to be sure that when Mem2Reg is on that all updates to instructions referring to the original +; 
parameter are updated correctly. When Mem2Reg is on, instructions will refer to the parameters +; directly and not through a load, which is why this is tested separately. + +; Note: the LLVM IR used as input to this test has already had Mem2Reg applied to it, so no need to +; do that here. This happens at higher optimization levels such as -O2. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking + +; CHECK-LABEL: @_ZGVbN4vv_vec_sum +; CHECK: simd.loop: +; CHECK: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK: %vec.i.elem = load i32, i32* %vec.i.cast.gep +; CHECK: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK: %vec.j.elem = load i32, i32* %vec.j.cast.gep +; CHECK: %add = add nsw i32 %vec.i.elem, %vec.j.elem + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %add = add nsw i32 %i, %j + ret i32 %add +} + +attributes #0 = { nounwind uwtable "_ZGVbM4vv_vec_sum" "_ZGVbN4vv_vec_sum" "_ZGVcM8vv_vec_sum" "_ZGVcN8vv_vec_sum" "_ZGVdM8vv_vec_sum" "_ZGVdN8vv_vec_sum" "_ZGVeM16vv_vec_sum" "_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/uniform.ll =================================================================== --- test/Transforms/VecClone/uniform.ll +++ test/Transforms/VecClone/uniform.ll @@ -0,0 +1,25 @@ +; Check to make sure the initial parameter store of the uniform parameter is sunk into the loop. 
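+; The sinking matters because the body below also writes to %b.addr:
+; re-storing %b at the top of every simd.loop iteration lets each lane start
+; from the original uniform value rather than the incremented value left by
+; the previous iteration, which is the behavior the CHECK lines pin down.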
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: <4 x i32> @_ZGVbN4u_foo(i32 %b) +; CHECK: simd.loop: +; CHECK: store i32 %b + +; ModuleID = 'uniform.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %b) #0 { +entry: + %b.addr = alloca i32, align 4 + store i32 %b, i32* %b.addr, align 4 + %0 = load i32, i32* %b.addr, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* %b.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + ret i32 %1 +} + +attributes #0 = { nounwind uwtable "_ZGVbM4u_foo" "_ZGVbN4u_foo" "_ZGVcM8u_foo" "_ZGVcN8u_foo" "_ZGVdM8u_foo" "_ZGVdN8u_foo" "_ZGVeM16u_foo" "_ZGVeN16u_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/void_foo.ll =================================================================== --- test/Transforms/VecClone/void_foo.ll +++ test/Transforms/VecClone/void_foo.ll @@ -0,0 +1,19 @@ +; Check to make sure we can handle void foo() function + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: void @_ZGVbN4_foo() +; CHECK: entry: +; CHECK: ret void + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @foo() #0 { +entry: + ret void +} + +attributes #0 = { nounwind uwtable "_ZGVbM4_foo1" "_ZGVbN4_foo1" "_ZGVcM8_foo1" "_ZGVcN8_foo1" "_ZGVdM8_foo1" "_ZGVdN8_foo1" "_ZGVeM16_foo1" "_ZGVeN16_foo1" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/bugpoint/bugpoint.cpp =================================================================== --- tools/bugpoint/bugpoint.cpp +++ tools/bugpoint/bugpoint.cpp @@ -130,6 +130,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); #ifdef LINK_POLLY_INTO_TOOLS polly::initializePollyPasses(Registry); Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -357,6 +357,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeCodeGenPreparePass(Registry);