Index: include/llvm-c/Transforms/Scalar.h
===================================================================
--- include/llvm-c/Transforms/Scalar.h
+++ include/llvm-c/Transforms/Scalar.h
@@ -41,6 +41,9 @@
 /** See llvm::createDeadStoreEliminationPass function. */
 void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM);
 
+/** See llvm::createDecomposeVectorsPass function. */
+void LLVMAddDecomposeVectorsPass(LLVMPassManagerRef PM);
+
 /** See llvm::createGVNPass function. */
 void LLVMAddGVNPass(LLVMPassManagerRef PM);
 
Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -321,6 +321,11 @@
     OK_UniformConstantValue // Operand is uniform constant.
   };
 
+  /// \returns True if the target prefers vector operations to be decomposed
+  /// into scalar operations at the IR level.  This can help on targets
+  /// with no vector support.
+  virtual bool shouldDecomposeVectors() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -119,6 +119,7 @@
 void initializeMemorySanitizerPass(PassRegistry&);
 void initializeThreadSanitizerPass(PassRegistry&);
 void initializeDataFlowSanitizerPass(PassRegistry&);
+void initializeDecomposeVectorsPass(PassRegistry&);
 void initializeEarlyCSEPass(PassRegistry&);
 void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeFindUsedTypesPass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -153,6 +153,7 @@
       (void) llvm::createSLPVectorizerPass();
       (void) llvm::createBBVectorizePass();
       (void) llvm::createPartiallyInlineLibCallsPass();
+      (void) llvm::createDecomposeVectorsPass();
 
       (void)new llvm::IntervalPartition();
       (void)new llvm::FindUsedTypes();
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -355,6 +355,12 @@
 //
 FunctionPass *createPartiallyInlineLibCallsPass();
 
+//===----------------------------------------------------------------------===//
+//
+// DecomposeVectorsPass - Decomposes vector operations into smaller pieces
+//
+FunctionPass *createDecomposeVectorsPass();
+
 } // End llvm namespace
 
 #endif
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -158,6 +158,10 @@
   return PrevTTI->getIntImmCost(Imm, Ty);
 }
 
+bool TargetTransformInfo::shouldDecomposeVectors() const {
+  return PrevTTI->shouldDecomposeVectors(); // Forward the query to PrevTTI.
+}
+
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
   return PrevTTI->getNumberOfRegisters(Vector);
 }
@@ -537,6 +541,10 @@
     return 1;
   }
 
+  bool shouldDecomposeVectors() const {
+    return false; // This implementation never asks for decomposition.
+  }
+
   unsigned getNumberOfRegisters(bool Vector) const {
     return 8;
   }
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -178,6 +178,7 @@
     MPM.add(createSROAPass(/*RequiresDomTree*/ false));
   else
     MPM.add(createScalarReplAggregatesPass(-1, false));
+  MPM.add(createDecomposeVectorsPass());
   MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
   MPM.add(createJumpThreadingPass());         // Thread jumps.
   MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -5,6 +5,7 @@
   CorrelatedValuePropagation.cpp
   DCE.cpp
   DeadStoreElimination.cpp
+  DecomposeVectors.cpp
   EarlyCSE.cpp
   GlobalMerge.cpp
   GVN.cpp
Index: lib/Transforms/Scalar/DecomposeVectors.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/DecomposeVectors.cpp
@@ -0,0 +1,563 @@
+//===--- DecomposeVectors.cpp - Decompose vector operations ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "decompose-vectors"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+namespace {
+// Holds the scattered form of a vector: one Value per element.
+typedef SmallVector<Value *, 8> ValueVector;
+
+// Maps a Value to the cached components of its scattered form.  We use
+// std::map because we want references to the mapped ValueVectors to persist
+// across insertion (their addresses are stored) and they are relatively large.
+typedef std::map<Value *, ValueVector> ScatterMap;
+
+// Lists Instructions that have been replaced with scalar implementations,
+// along with a pointer to their scattered forms.
+typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList;
+
+// Provides a very limited vector-like interface for lazily accessing the
+// components of a scattered vector or vector pointer, one at a time.
+class Scatterer {
+public:
+  // Scatter v into its components.  If new instructions are needed,
+  // insert them before bbi in bb.  If cachePtr is nonnull, use it to cache
+  // the results; otherwise they are kept in this object only.
+  Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+            ValueVector *cachePtr = 0);
+
+  // Return component I, creating a new Value for it if necessary.
+  Value *operator[](unsigned I);
+
+  // Return the number of components.
+  unsigned size() const { return Size; }
+
+private:
+  BasicBlock *BB;            // Block that receives any new instructions.
+  BasicBlock::iterator BBI;  // New instructions are inserted before this.
+  Value *V;                  // The vector, or pointer to a vector, to split.
+  ValueVector *CachePtr;     // Shared component cache, or null to use Tmp.
+  PointerType *PtrTy;        // V's type if V is a pointer, otherwise null.
+  ValueVector Tmp;           // Component storage when there is no cache.
+  unsigned Size;             // Number of vector elements.
+};
+
+// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y with the same predicate as FCI.
+struct FCmpSplitter {
+  FCmpInst &FCI;
+  FCmpSplitter(FCmpInst &Cmp) : FCI(Cmp) {}
+  Value *operator()(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                    const Twine &Name) const {
+    return Builder.CreateFCmp(FCI.getPredicate(), LHS, RHS, Name);
+  }
+};
+
+// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y with the same predicate as ICI.
+struct ICmpSplitter {
+  ICmpInst &ICI;
+  ICmpSplitter(ICmpInst &Cmp) : ICI(Cmp) {}
+  Value *operator()(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                    const Twine &Name) const {
+    return Builder.CreateICmp(ICI.getPredicate(), LHS, RHS, Name);
+  }
+};
+
+// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create a binary
+// operator called Name with BO's opcode and the operands X and Y.
+struct BinarySplitter {
+  BinaryOperator &BO;
+  BinarySplitter(BinaryOperator &Op) : BO(Op) {}
+  Value *operator()(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                    const Twine &Name) const {
+    return Builder.CreateBinOp(BO.getOpcode(), LHS, RHS, Name);
+  }
+};
+
+class DecomposeVectors : public FunctionPass,
+                         public InstVisitor<DecomposeVectors, bool> {
+public:
+  static char ID; // Passed to the FunctionPass constructor below.
+
+  DecomposeVectors() :
+    FunctionPass(ID) {
+    initializeDecomposeVectorsPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual bool runOnFunction(Function &F);
+
+  // InstVisitor methods.  Each returns true if it replaced the instruction
+  // with scalar operations and false if it left the instruction untouched.
+  bool visitInstruction(Instruction &) { return false; }
+  bool visitSelectInst(SelectInst &SI);
+  bool visitICmpInst(ICmpInst &);
+  bool visitFCmpInst(FCmpInst &);
+  bool visitBinaryOperator(BinaryOperator &);
+  bool visitCastInst(CastInst &);
+  bool visitBitCastInst(BitCastInst &);
+  bool visitShuffleVectorInst(ShuffleVectorInst &);
+  bool visitPHINode(PHINode &);
+  bool visitLoadInst(LoadInst &);
+  bool visitStoreInst(StoreInst &);
+
+private:
+  Scatterer scatter(Instruction *, Value *);
+  void gather(Instruction *, const ValueVector &);
+  void transferMetadata(Instruction *, const ValueVector &);
+  bool finish();
+
+  template<typename T> bool splitBinary(Instruction &, const T &);
+
+  ScatterMap Scattered; // Component caches, keyed by the value that was split.
+  GatherList Gathered;  // Decomposed instructions that finish() will erase.
+};
+
+char DecomposeVectors::ID = 0;
+} // end anonymous namespace
+
+// When given explicitly, overrides the TargetTransformInfo preference.
+static cl::opt<bool> EnableDecomposeVectors
+  ("enable-decompose-vectors", cl::Hidden, cl::init(false),
+   cl::desc("Enable the DecomposeVectors pass, ignoring the target's"
+            " preference"));
+
+// This is disabled by default because having separate loads and stores makes
+// it more likely that the -combiner-alias-analysis limits will be reached.
+static cl::opt<bool> DecomposeVectorLoadStore
+  ("decompose-vector-load-store", cl::Hidden, cl::init(false),
+   cl::desc("Allow the decompose-vectors pass to decompose loads and stores"));
+
+INITIALIZE_PASS(DecomposeVectors, "decompose-vectors",
+                "Decompose vector operations into smaller pieces",
+                false, false)
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+                     ValueVector *cachePtr)
+  : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr),
+    PtrTy(dyn_cast<PointerType>(v->getType())) {
+  // V is either a vector or a pointer to a vector; find the vector type.
+  Type *VecTy = PtrTy ? PtrTy->getElementType() : V->getType();
+  Size = VecTy->getVectorNumElements();
+  // Components live in the shared cache if there is one, otherwise in Tmp.
+  // A cache that was filled in earlier must describe the same element count.
+  ValueVector &Components = CachePtr ? *CachePtr : Tmp;
+  if (Components.empty())
+    Components.resize(Size, 0);
+  else
+    assert(Size == Components.size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+  ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+  // Try to reuse a previous value.
+  if (CV[I])
+    return CV[I];
+  IRBuilder<> Builder(BB, BBI);
+  if (PtrTy) {
+    if (!CV[0]) {
+      Type *EltTy = PtrTy->getElementType()->getVectorElementType();
+      Type *Ty = PointerType::get(EltTy, PtrTy->getAddressSpace());
+      CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0");
+    }
+    if (I != 0)
+      CV[I] = Builder.CreateConstGEP1_32(CV[0], I,
+                                         V->getName() + ".i" + Twine(I));
+  } else {
+    // Search a chain of in-range constant-index InsertElementInsts for
+    // element I.  Cache other elements unless already cached (a later insert
+    // wins).  The new V is still suitable for all uncached indices.
+    for (;;) {
+      InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+      if (!Insert)
+        break;
+      ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+      if (!Idx || Idx->uge(Size))
+        break;
+      unsigned J = Idx->getZExtValue();
+      V = Insert->getOperand(0);
+      if (I == J)
+        return CV[J] = Insert->getOperand(1);
+      if (!CV[J])
+        CV[J] = Insert->getOperand(1);
+    }
+    CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+                                         V->getName() + ".i" + Twine(I));
+  }
+  return CV[I];
+}
+
+void DecomposeVectors::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetTransformInfo>(); // Queried in runOnFunction.
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+bool DecomposeVectors::runOnFunction(Function &F) {
+  // An explicit -enable-decompose-vectors overrides the target's preference.
+  if (EnableDecomposeVectors.getNumOccurrences() > 0 ? !EnableDecomposeVectors
+      : !getAnalysis<TargetTransformInfo>().shouldDecomposeVectors())
+    return false;
+  // Decomposed stores are erased below without being recorded for finish().
+  bool Changed = false;
+  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
+    for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;) {
+      Instruction *I = II;
+      bool Done = visit(I);
+      ++II; // Step past I before it is possibly erased.
+      Changed |= Done;
+      if (Done && I->getType()->isVoidTy())
+        I->eraseFromParent();
+    }
+  }
+  return finish() || Changed;
+}
+
+// Return a scattered form of V that can be accessed by Point.  V must be a
+// vector or a pointer to a vector.
+Scatterer DecomposeVectors::scatter(Instruction *Point, Value *V) {
+  if (Argument *VArg = dyn_cast<Argument>(V)) {
+    // Scatter arguments in the entry block so that every use can see them.
+    BasicBlock *BB = &VArg->getParent()->getEntryBlock();
+    return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+  }
+  if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+    // Put the scattered form of an instruction directly after the
+    // instruction, but never in the middle of a block's leading PHI nodes.
+    BasicBlock *BB = VOp->getParent();
+    BasicBlock::iterator InsertPt = llvm::next(BasicBlock::iterator(VOp));
+    while (InsertPt != BB->end() && isa<PHINode>(InsertPt))
+      ++InsertPt;
+    return Scatterer(BB, InsertPt, V, &Scattered[V]);
+  }
+  // In the fallback case, just put the scattered form before Point and
+  // keep the result local to Point.
+  return Scatterer(Point->getParent(), Point, V);
+}
+
+// Replace Op with the gathered form of the components in CV.  Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void DecomposeVectors::gather(Instruction *Op, const ValueVector &CV) {
+  // Since we're not deleting Op yet, stub out its operands, so that it
+  // doesn't make anything live unnecessarily.
+  for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
+    Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
+
+  transferMetadata(Op, CV);
+
+  // Replace any scattered form of Op made from ExtractElements of Op itself.
+  // Components are created lazily, so skip any that were never requested.
+  ValueVector &SV = Scattered[Op];
+  for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+    Instruction *Old = cast_or_null<Instruction>(SV[I]);
+    if (!Old || Old == CV[I])
+      continue;
+    CV[I]->takeName(Old);
+    Old->replaceAllUsesWith(CV[I]);
+    Old->eraseFromParent();
+  }
+  SV = CV;
+  Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Copy all of Op's metadata onto every Instruction in CV; components that
+// are not Instructions are skipped.  At present there are no tags that need
+// to be dropped or altered.  The most important ones to carry over are
+// debug location, fpmath, tbaa and llvm.mem.parallel_loop_access.
+void DecomposeVectors::transferMetadata(Instruction *Op,
+                                        const ValueVector &CV) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Attachments;
+  Op->getAllMetadata(Attachments);
+  for (ValueVector::const_iterator VI = CV.begin(), VE = CV.end();
+       VI != VE; ++VI)
+    if (Instruction *New = dyn_cast<Instruction>(*VI))
+      for (unsigned K = 0, NumMDs = Attachments.size(); K != NumMDs; ++K)
+        New->setMetadata(Attachments[K].first, Attachments[K].second);
+}
+
+// Decompose two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
+// Returns false, changing nothing, if I does not produce a vector.
+template<typename Splitter>
+bool DecomposeVectors::splitBinary(Instruction &I, const Splitter &Split) {
+  VectorType *VecTy = dyn_cast<VectorType>(I.getType());
+  if (!VecTy)
+    return false;
+
+  const unsigned Count = VecTy->getNumElements();
+  IRBuilder<> Builder(I.getParent(), &I);
+  Scatterer LHS = scatter(&I, I.getOperand(0));
+  Scatterer RHS = scatter(&I, I.getOperand(1));
+  assert(LHS.size() == Count && "Mismatched binary operation");
+  assert(RHS.size() == Count && "Mismatched binary operation");
+  ValueVector Pieces(Count);
+  for (unsigned Idx = 0; Idx != Count; ++Idx)
+    Pieces[Idx] = Split(Builder, LHS[Idx], RHS[Idx],
+                        I.getName() + ".i" + Twine(Idx));
+  gather(&I, Pieces);
+  return true;
+}
+
+bool DecomposeVectors::visitSelectInst(SelectInst &SI) {
+  VectorType *VecTy = dyn_cast<VectorType>(SI.getType());
+  if (!VecTy)
+    return false;
+
+  const unsigned Count = VecTy->getNumElements();
+  IRBuilder<> Builder(SI.getParent(), &SI);
+  Scatterer TVals = scatter(&SI, SI.getOperand(1));
+  Scatterer FVals = scatter(&SI, SI.getOperand(2));
+  assert(TVals.size() == Count && "Mismatched select");
+  assert(FVals.size() == Count && "Mismatched select");
+  ValueVector Parts(Count);
+  Value *Cond = SI.getOperand(0);
+
+  if (!Cond->getType()->isVectorTy()) {
+    // A non-vector condition is shared by every element.
+    for (unsigned Lane = 0; Lane != Count; ++Lane)
+      Parts[Lane] = Builder.CreateSelect(Cond, TVals[Lane], FVals[Lane],
+                                         SI.getName() + ".i" + Twine(Lane));
+  } else {
+    Scatterer Conds = scatter(&SI, Cond);
+    assert(Conds.size() == Count && "Mismatched select");
+    for (unsigned Lane = 0; Lane != Count; ++Lane)
+      Parts[Lane] = Builder.CreateSelect(Conds[Lane], TVals[Lane], FVals[Lane],
+                                         SI.getName() + ".i" + Twine(Lane));
+  }
+  gather(&SI, Parts);
+  return true;
+}
+
+bool DecomposeVectors::visitICmpInst(ICmpInst &ICI) {
+  return splitBinary(ICI, ICmpSplitter(ICI)); // One icmp per element.
+}
+
+bool DecomposeVectors::visitFCmpInst(FCmpInst &FCI) {
+  return splitBinary(FCI, FCmpSplitter(FCI)); // One fcmp per element.
+}
+
+bool DecomposeVectors::visitBinaryOperator(BinaryOperator &BO) {
+  return splitBinary(BO, BinarySplitter(BO)); // One binary op per element.
+}
+
+// Replace a vector-producing cast with one scalar cast per element.
+bool DecomposeVectors::visitCastInst(CastInst &CI) {
+  VectorType *DestVecTy = dyn_cast<VectorType>(CI.getDestTy());
+  if (!DestVecTy)
+    return false;
+  const unsigned Count = DestVecTy->getNumElements();
+  Type *DestEltTy = DestVecTy->getElementType();
+  IRBuilder<> Builder(CI.getParent(), &CI);
+  Scatterer Src = scatter(&CI, CI.getOperand(0));
+  assert(Src.size() == Count && "Mismatched cast");
+  ValueVector Parts(Count);
+  for (unsigned Lane = 0; Lane != Count; ++Lane)
+    Parts[Lane] = Builder.CreateCast(CI.getOpcode(), Src[Lane], DestEltTy,
+                                     CI.getName() + ".i" + Twine(Lane));
+  gather(&CI, Parts);
+  return true;
+}
+
+bool DecomposeVectors::visitBitCastInst(BitCastInst &BCI) {
+  VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+  VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+  if (!DstVT || !SrcVT)
+    return false;
+
+  unsigned DstNumElems = DstVT->getNumElements();
+  unsigned SrcNumElems = SrcVT->getNumElements();
+  // The regrouping below needs one element count to divide the other.
+  if (DstNumElems % SrcNumElems != 0 && SrcNumElems % DstNumElems != 0)
+    return false;
+  IRBuilder<> Builder(BCI.getParent(), &BCI);
+  Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+  ValueVector Res(DstNumElems);
+
+  if (DstNumElems == SrcNumElems) {
+    for (unsigned I = 0; I < DstNumElems; ++I)
+      Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+                                     BCI.getName() + ".i" + Twine(I));
+  } else if (DstNumElems > SrcNumElems) {
+    // <M x t1> -> <N*M x t2>.  Convert each t1 to <N x t2> and copy the
+    // individual elements to the destination.
+    unsigned FanOut = DstNumElems / SrcNumElems;
+    Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut);
+    unsigned ResI = 0;
+    for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+      Value *V = Op0[Op0I];
+      // Look through any existing bitcasts before converting to <N x t2>.
+      // In the best case, the resulting conversion might be a no-op.
+      while (BitCastInst *Cast = dyn_cast<BitCastInst>(V))
+        V = Cast->getOperand(0);
+      V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+      Scatterer Mid = scatter(&BCI, V);
+      for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+        Res[ResI++] = Mid[MidI];
+    }
+  } else {
+    // <N*M x t1> -> <M x t2>.  Convert each group of <N x t1> into a t2.
+    unsigned FanIn = SrcNumElems / DstNumElems;
+    Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn);
+    unsigned Op0I = 0;
+    for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+      Value *V = UndefValue::get(MidTy);
+      for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+        V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+                                        BCI.getName() + ".i" + Twine(ResI)
+                                        + ".upto" + Twine(MidI));
+      Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+                                        BCI.getName() + ".i" + Twine(ResI));
+    }
+  }
+  gather(&BCI, Res);
+  return true;
+}
+
+bool DecomposeVectors::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  VectorType *ResTy = dyn_cast<VectorType>(SVI.getType());
+  if (!ResTy)
+    return false;
+
+  Scatterer First = scatter(&SVI, SVI.getOperand(0));
+  Scatterer Second = scatter(&SVI, SVI.getOperand(1));
+  const unsigned FirstSize = First.size();
+  ValueVector Parts(ResTy->getNumElements());
+
+  // Mask values index the concatenation of both inputs; negative means undef.
+  for (unsigned Lane = 0, E = Parts.size(); Lane != E; ++Lane) {
+    const int Selector = SVI.getMaskValue(Lane);
+    if (Selector < 0)
+      Parts[Lane] = UndefValue::get(ResTy->getElementType());
+    else if (unsigned(Selector) >= FirstSize)
+      Parts[Lane] = Second[Selector - FirstSize];
+    else
+      Parts[Lane] = First[Selector];
+  }
+  gather(&SVI, Parts);
+  return true;
+}
+
+bool DecomposeVectors::visitPHINode(PHINode &PHI) {
+  VectorType *VecTy = dyn_cast<VectorType>(PHI.getType());
+  if (!VecTy)
+    return false;
+
+  const unsigned Count = VecTy->getNumElements();
+  const unsigned NumIncoming = PHI.getNumOperands();
+  IRBuilder<> Builder(PHI.getParent(), &PHI);
+  ValueVector Parts(Count);
+
+  // Create the empty scalar PHIs first, then fill in their incoming values.
+  for (unsigned Lane = 0; Lane != Count; ++Lane)
+    Parts[Lane] = Builder.CreatePHI(VecTy->getElementType(), NumIncoming,
+                                    PHI.getName() + ".i" + Twine(Lane));
+
+  for (unsigned In = 0; In != NumIncoming; ++In) {
+    Scatterer Incoming = scatter(&PHI, PHI.getIncomingValue(In));
+    BasicBlock *Pred = PHI.getIncomingBlock(In);
+    for (unsigned Lane = 0; Lane != Count; ++Lane)
+      cast<PHINode>(Parts[Lane])->addIncoming(Incoming[Lane], Pred);
+  }
+  gather(&PHI, Parts);
+  return true;
+}
+
+// Split a simple vector load into one scalar load per element.  Only done
+// under -decompose-vector-load-store.
+bool DecomposeVectors::visitLoadInst(LoadInst &LI) {
+  if (!DecomposeVectorLoadStore || !LI.isSimple())
+    return false;
+
+  VectorType *VecTy = dyn_cast<VectorType>(LI.getType());
+  if (!VecTy)
+    return false;
+
+  // NOTE(review): the scalar loads are created without LI's alignment;
+  // confirm that the default is safe for under-aligned vector loads.
+  IRBuilder<> Builder(LI.getParent(), &LI);
+  Scatterer Addr = scatter(&LI, LI.getPointerOperand());
+  ValueVector Parts(VecTy->getNumElements());
+
+  for (unsigned Lane = 0, E = Parts.size(); Lane != E; ++Lane)
+    Parts[Lane] = Builder.CreateLoad(Addr[Lane],
+                                     LI.getName() + ".i" + Twine(Lane));
+  gather(&LI, Parts);
+  return true;
+}
+
+// Split a simple vector store into one scalar store per element.  Only done
+// under -decompose-vector-load-store.  Returning true lets runOnFunction
+// erase SI, since it has void type and nothing else deletes it.
+bool DecomposeVectors::visitStoreInst(StoreInst &SI) {
+  if (!DecomposeVectorLoadStore || !SI.isSimple())
+    return false;
+
+  Value *Stored = SI.getValueOperand();
+  VectorType *VecTy = dyn_cast<VectorType>(Stored->getType());
+  if (!VecTy)
+    return false;
+
+  // NOTE(review): the scalar stores are created without SI's alignment;
+  // confirm that the default is safe for under-aligned vector stores.
+  IRBuilder<> Builder(SI.getParent(), &SI);
+  Scatterer Addr = scatter(&SI, SI.getPointerOperand());
+  Scatterer Elts = scatter(&SI, Stored);
+
+  ValueVector NewStores(VecTy->getNumElements());
+  for (unsigned Lane = 0, E = NewStores.size(); Lane != E; ++Lane)
+    NewStores[Lane] = Builder.CreateStore(Elts[Lane], Addr[Lane]);
+  transferMetadata(&SI, NewStores);
+  return true;
+}
+
+// Delete the instructions that we decomposed, recreating any vector result
+// that is still needed with InsertElements, then reset the per-run state.
+bool DecomposeVectors::finish() {
+  bool Changed = !Gathered.empty() || !Scattered.empty();
+  for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end();
+       GMI != GME; ++GMI) {
+    Instruction *Op = GMI->first;
+    ValueVector &CV = *GMI->second;
+    if (!Op->use_empty()) {
+      // The value is still needed, so recreate it using a series of
+      // InsertElements.  For a PHI they must go after the block's PHI nodes.
+      Type *Ty = Op->getType();
+      Value *Res = UndefValue::get(Ty);
+      BasicBlock *BB = Op->getParent();
+      IRBuilder<> Builder(BB, isa<PHINode>(Op) ? BB->getFirstInsertionPt()
+                                               : BasicBlock::iterator(Op));
+      for (unsigned I = 0, E = Ty->getVectorNumElements(); I != E; ++I)
+        Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+                                          Op->getName() + ".upto" + Twine(I));
+      Res->takeName(Op);
+      Op->replaceAllUsesWith(Res);
+    }
+    Op->eraseFromParent();
+  }
+  Gathered.clear();
+  Scattered.clear(); // Keyed by Value pointers: must not outlive this run.
+  return Changed;
+}
+
+FunctionPass *llvm::createDecomposeVectorsPass() {
+  return new DecomposeVectors(); // Heap-allocated; not freed here.
+}
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -33,6 +33,7 @@
   initializeCorrelatedValuePropagationPass(Registry);
   initializeDCEPass(Registry);
   initializeDeadInstEliminationPass(Registry);
+  initializeDecomposeVectorsPass(Registry);
   initializeDSEPass(Registry);
   initializeGVNPass(Registry);
   initializeEarlyCSEPass(Registry);
@@ -79,6 +80,10 @@
   unwrap(PM)->add(createDeadStoreEliminationPass());
 }
 
+void LLVMAddDecomposeVectorsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDecomposeVectorsPass()); // Adds a new pass to PM.
+}
+
 void LLVMAddGVNPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createGVNPass());
 }
Index: test/Transforms/DecomposeVectors/basic.ll
===================================================================
--- /dev/null
+++ test/Transforms/DecomposeVectors/basic.ll
@@ -0,0 +1,243 @@
+; RUN: opt %s -decompose-vectors -enable-decompose-vectors \
+; RUN:     -decompose-vector-load-store -dce -S | FileCheck %s
+declare <4 x float> @ext(<4 x float>) ; declaration only; called from @f1
+
+define void @f1(<4 x float> %init, <4 x float> *%base, i32 %count) {
+; CHECK-LABEL: @f1(
+; CHECK: entry:
+; CHECK:   %init.i0 = extractelement <4 x float> %init, i32 0
+; CHECK:   %init.i1 = extractelement <4 x float> %init, i32 1
+; CHECK:   %init.i2 = extractelement <4 x float> %init, i32 2
+; CHECK:   %init.i3 = extractelement <4 x float> %init, i32 3
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %i = phi i32 [ %count, %entry ], [ %nexti, %loop ]
+; CHECK:   %acc.i0 = phi float [ %init.i0, %entry ], [ %sel.i0, %loop ]
+; CHECK:   %acc.i1 = phi float [ %init.i1, %entry ], [ %sel.i1, %loop ]
+; CHECK:   %acc.i2 = phi float [ %init.i2, %entry ], [ %sel.i2, %loop ]
+; CHECK:   %acc.i3 = phi float [ %init.i3, %entry ], [ %sel.i3, %loop ]
+; CHECK:   %nexti = sub i32 %i, 1
+; CHECK:   %ptr = getelementptr <4 x float>* %base, i32 %i
+; CHECK:   %ptr.i0 = bitcast <4 x float>* %ptr to float*
+; CHECK:   %val.i0 = load float* %ptr.i0
+; CHECK:   %ptr.i1 = getelementptr float* %ptr.i0, i32 1
+; CHECK:   %val.i1 = load float* %ptr.i1
+; CHECK:   %ptr.i2 = getelementptr float* %ptr.i0, i32 2
+; CHECK:   %val.i2 = load float* %ptr.i2
+; CHECK:   %ptr.i3 = getelementptr float* %ptr.i0, i32 3
+; CHECK:   %val.i3 = load float* %ptr.i3
+; CHECK:   %add.i0 = fadd float %val.i0, %val.i2
+; CHECK:   %add.i1 = fadd float %val.i1, %val.i3
+; CHECK:   %add.i2 = fadd float %acc.i0, %acc.i2
+; CHECK:   %add.i3 = fadd float %acc.i1, %acc.i3
+; CHECK:   %add.upto0 = insertelement <4 x float> undef, float %add.i0, i32 0
+; CHECK:   %add.upto1 = insertelement <4 x float> %add.upto0, float %add.i1, i32 1
+; CHECK:   %add.upto2 = insertelement <4 x float> %add.upto1, float %add.i2, i32 2
+; CHECK:   %add = insertelement <4 x float> %add.upto2, float %add.i3, i32 3
+; CHECK:   %call = call <4 x float> @ext(<4 x float> %add)
+; CHECK:   %call.i0 = extractelement <4 x float> %call, i32 0
+; CHECK:   %cmp.i0 = fcmp ogt float %call.i0, 1.0
+; CHECK:   %call.i1 = extractelement <4 x float> %call, i32 1
+; CHECK:   %cmp.i1 = fcmp ogt float %call.i1, 2.0
+; CHECK:   %call.i2 = extractelement <4 x float> %call, i32 2
+; CHECK:   %cmp.i2 = fcmp ogt float %call.i2, 3.0
+; CHECK:   %call.i3 = extractelement <4 x float> %call, i32 3
+; CHECK:   %cmp.i3 = fcmp ogt float %call.i3, 4.0
+; CHECK:   %sel.i0 = select i1 %cmp.i0, float %call.i0, float 5.0
+; CHECK:   %sel.i1 = select i1 %cmp.i1, float %call.i1, float 6.0
+; CHECK:   %sel.i2 = select i1 %cmp.i2, float %call.i2, float 7.0
+; CHECK:   %sel.i3 = select i1 %cmp.i3, float %call.i3, float 8.0
+; CHECK:   store float %sel.i0, float* %ptr.i0
+; CHECK:   store float %sel.i1, float* %ptr.i1
+; CHECK:   store float %sel.i2, float* %ptr.i2
+; CHECK:   store float %sel.i3, float* %ptr.i3
+; CHECK:   %test = icmp eq i32 %nexti, 0
+; CHECK:   br i1 %test, label %loop, label %exit
+; CHECK: exit:
+; CHECK:   ret void
+entry: ; input: a loop mixing vector phi, load, bitcast, shuffle, fadd, call, fcmp, select, store
+  br label %loop
+
+loop:
+  %i = phi i32 [ %count, %entry ], [ %nexti, %loop ]
+  %acc = phi <4 x float> [ %init, %entry ], [ %sel, %loop ]
+  %nexti = sub i32 %i, 1
+
+  %ptr = getelementptr <4 x float> *%base, i32 %i
+  %val = load <4 x float> *%ptr
+  %dval = bitcast <4 x float> %val to <2 x double>
+  %dacc = bitcast <4 x float> %acc to <2 x double>
+  %shuffle1 = shufflevector <2 x double> %dval, <2 x double> %dacc,
+                            <2 x i32> <i32 0, i32 2> ; element 0 of %dval, then element 0 of %dacc
+  %shuffle2 = shufflevector <2 x double> %dval, <2 x double> %dacc,
+                            <2 x i32> <i32 1, i32 3> ; element 1 of %dval, then element 1 of %dacc
+  %f1 = bitcast <2 x double> %shuffle1 to <4 x float>
+  %f2 = bitcast <2 x double> %shuffle2 to <4 x float>
+  %add = fadd <4 x float> %f1, %f2
+  %call = call <4 x float> @ext(<4 x float> %add) ; remains a vector call in the expected output
+  %cmp = fcmp ogt <4 x float> %call,
+                  <float 1.0, float 2.0, float 3.0, float 4.0>
+  %sel = select <4 x i1> %cmp, <4 x float> %call,
+                <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>
+  store <4 x float> %sel, <4 x float> *%ptr
+
+  %test = icmp eq i32 %nexti, 0
+  br i1 %test, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @f2(<4 x i32> %init, <4 x i8> *%base, i32 %count) {
+; CHECK-LABEL: define void @f2(<4 x i32> %init, <4 x i8>* %base, i32 %count) {
+; CHECK: entry:
+; CHECK:   %init.i0 = extractelement <4 x i32> %init, i32 0
+; CHECK:   %init.i1 = extractelement <4 x i32> %init, i32 1
+; CHECK:   %init.i2 = extractelement <4 x i32> %init, i32 2
+; CHECK:   %init.i3 = extractelement <4 x i32> %init, i32 3
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %i = phi i32 [ %count, %entry ], [ %nexti, %loop ]
+; CHECK:   %acc.i0 = phi i32 [ %init.i0, %entry ], [ %sel.i0, %loop ]
+; CHECK:   %acc.i1 = phi i32 [ %init.i1, %entry ], [ %sel.i1, %loop ]
+; CHECK:   %acc.i2 = phi i32 [ %init.i2, %entry ], [ %sel.i2, %loop ]
+; CHECK:   %acc.i3 = phi i32 [ %init.i3, %entry ], [ %sel.i3, %loop ]
+; CHECK:   %nexti = sub i32 %i, 1
+; CHECK:   %ptr = getelementptr <4 x i8>* %base, i32 %i
+; CHECK:   %ptr.i0 = bitcast <4 x i8>* %ptr to i8*
+; CHECK:   %val.i0 = load i8* %ptr.i0
+; CHECK:   %ptr.i1 = getelementptr i8* %ptr.i0, i32 1
+; CHECK:   %val.i1 = load i8* %ptr.i1
+; CHECK:   %ptr.i2 = getelementptr i8* %ptr.i0, i32 2
+; CHECK:   %val.i2 = load i8* %ptr.i2
+; CHECK:   %ptr.i3 = getelementptr i8* %ptr.i0, i32 3
+; CHECK:   %val.i3 = load i8* %ptr.i3
+; CHECK:   %ext.i0 = sext i8 %val.i0 to i32
+; CHECK:   %ext.i1 = sext i8 %val.i1 to i32
+; CHECK:   %ext.i2 = sext i8 %val.i2 to i32
+; CHECK:   %ext.i3 = sext i8 %val.i3 to i32
+; CHECK:   %add.i0 = add i32 %ext.i0, %acc.i0
+; CHECK:   %add.i1 = add i32 %ext.i1, %acc.i1
+; CHECK:   %add.i2 = add i32 %ext.i2, %acc.i2
+; CHECK:   %add.i3 = add i32 %ext.i3, %acc.i3
+; CHECK:   %cmp.i0 = icmp slt i32 %add.i0, -10
+; CHECK:   %cmp.i1 = icmp slt i32 %add.i1, -11
+; CHECK:   %cmp.i2 = icmp slt i32 %add.i2, -12
+; CHECK:   %cmp.i3 = icmp slt i32 %add.i3, -13
+; CHECK:   %sel.i0 = select i1 %cmp.i0, i32 %add.i0, i32 %i
+; CHECK:   %sel.i1 = select i1 %cmp.i1, i32 %add.i1, i32 %i
+; CHECK:   %sel.i2 = select i1 %cmp.i2, i32 %add.i2, i32 %i
+; CHECK:   %sel.i3 = select i1 %cmp.i3, i32 %add.i3, i32 %i
+; CHECK:   %trunc.i0 = trunc i32 %sel.i0 to i8
+; CHECK:   %trunc.i1 = trunc i32 %sel.i1 to i8
+; CHECK:   %trunc.i2 = trunc i32 %sel.i2 to i8
+; CHECK:   %trunc.i3 = trunc i32 %sel.i3 to i8
+; CHECK:   store i8 %trunc.i0, i8* %ptr.i0
+; CHECK:   store i8 %trunc.i1, i8* %ptr.i1
+; CHECK:   store i8 %trunc.i2, i8* %ptr.i2
+; CHECK:   store i8 %trunc.i3, i8* %ptr.i3
+; CHECK:   %test = icmp eq i32 %nexti, 0
+; CHECK:   br i1 %test, label %loop, label %exit
+; CHECK: exit:
+; CHECK:   ret void
+entry: ; input: a loop over vector load, sext, add, icmp, splat, select, trunc, store
+  br label %loop
+
+loop:
+  %i = phi i32 [ %count, %entry ], [ %nexti, %loop ]
+  %acc = phi <4 x i32> [ %init, %entry ], [ %sel, %loop ]
+  %nexti = sub i32 %i, 1
+
+  %ptr = getelementptr <4 x i8> *%base, i32 %i
+  %val = load <4 x i8> *%ptr
+  %ext = sext <4 x i8> %val to <4 x i32>
+  %add = add <4 x i32> %ext, %acc
+  %cmp = icmp slt <4 x i32> %add, <i32 -10, i32 -11, i32 -12, i32 -13>
+  %single = insertelement <4 x i32> undef, i32 %i, i32 0
+  %limit = shufflevector <4 x i32> %single, <4 x i32> undef,
+                         <4 x i32> zeroinitializer ; all-zero mask: each lane is element 0 of %single, i.e. %i
+  %sel = select <4 x i1> %cmp, <4 x i32> %add, <4 x i32> %limit
+  %trunc = trunc <4 x i32> %sel to <4 x i8>
+  store <4 x i8> %trunc, <4 x i8> *%ptr
+
+  %test = icmp eq i32 %nexti, 0
+  br i1 %test, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that TBAA information is preserved.
+define void @f3(<4 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: @f3(
+; CHECK: %val.i0 = load i32* %src.i0, !tbaa ![[TAG:[0-9]*]]
+; CHECK: %val.i1 = load i32* %src.i1, !tbaa ![[TAG]]
+; CHECK: %val.i2 = load i32* %src.i2, !tbaa ![[TAG]]
+; CHECK: %val.i3 = load i32* %src.i3, !tbaa ![[TAG]]
+; CHECK: store i32 %add.i0, i32* %dst.i0, !tbaa ![[TAG:[0-9]*]]
+; CHECK: store i32 %add.i1, i32* %dst.i1, !tbaa ![[TAG]]
+; CHECK: store i32 %add.i2, i32* %dst.i2, !tbaa ![[TAG]]
+; CHECK: store i32 %add.i3, i32* %dst.i3, !tbaa ![[TAG]]
+; CHECK: ret void
+  %val = load <4 x i32> *%src, !tbaa !1 ; load is tagged with node !1 ("set1")
+  %add = add <4 x i32> %val, %val
+  store <4 x i32> %add, <4 x i32> *%dst, !tbaa !2 ; store uses a different node, !2 ("set2")
+  ret void
+}
+
+; Check that llvm.mem.parallel_loop_access information is preserved.
+define void @f4(i32 %count, <4 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: @f4(
+; CHECK: %val.i0 = load i32* %this_src.i0, !llvm.mem.parallel_loop_access ![[TAG:[0-9]*]]
+; CHECK: %val.i1 = load i32* %this_src.i1, !llvm.mem.parallel_loop_access ![[TAG]]
+; CHECK: %val.i2 = load i32* %this_src.i2, !llvm.mem.parallel_loop_access ![[TAG]]
+; CHECK: %val.i3 = load i32* %this_src.i3, !llvm.mem.parallel_loop_access ![[TAG]]
+; CHECK: store i32 %add.i0, i32* %this_dst.i0, !llvm.mem.parallel_loop_access ![[TAG]]
+; CHECK: store i32 %add.i1, i32* %this_dst.i1, !llvm.mem.parallel_loop_access ![[TAG]]
+; CHECK: store i32 %add.i2, i32* %this_dst.i2, !llvm.mem.parallel_loop_access ![[TAG]]
+; CHECK: store i32 %add.i3, i32* %this_dst.i3, !llvm.mem.parallel_loop_access ![[TAG]]
+; CHECK: ret void
+entry:
+  br label %loop
+
+loop:
+  %index = phi i32 [ 0, %entry ], [ %next_index, %loop ]
+  %this_src = getelementptr <4 x i32> *%src, i32 %index
+  %this_dst = getelementptr <4 x i32> *%dst, i32 %index
+  %val = load <4 x i32> *%this_src, !llvm.mem.parallel_loop_access !3
+  %add = add <4 x i32> %val, %val
+  store <4 x i32> %add, <4 x i32> *%this_dst, !llvm.mem.parallel_loop_access !3
+  %next_index = add i32 %index, -1
+  %continue = icmp ne i32 %next_index, %count
+  br i1 %continue, label %loop, label %end, !llvm.loop !3 ; same node !3 as on the load and store above
+
+end:
+  ret void
+}
+
+; Check that fpmath information is preserved.
+define <4 x float> @f5(<4 x float> %x) {
+; CHECK-LABEL: @f5(
+; CHECK: %x.i0 = extractelement <4 x float> %x, i32 0
+; CHECK: %res.i0 = fadd float %x.i0, 1.0{{[e+0]*}}, !fpmath ![[TAG:[0-9]*]]
+; CHECK: %x.i1 = extractelement <4 x float> %x, i32 1
+; CHECK: %res.i1 = fadd float %x.i1, 2.0{{[e+0]*}}, !fpmath ![[TAG]]
+; CHECK: %x.i2 = extractelement <4 x float> %x, i32 2
+; CHECK: %res.i2 = fadd float %x.i2, 3.0{{[e+0]*}}, !fpmath ![[TAG]]
+; CHECK: %x.i3 = extractelement <4 x float> %x, i32 3
+; CHECK: %res.i3 = fadd float %x.i3, 4.0{{[e+0]*}}, !fpmath ![[TAG]]
+; CHECK: %res.upto0 = insertelement <4 x float> undef, float %res.i0, i32 0
+; CHECK: %res.upto1 = insertelement <4 x float> %res.upto0, float %res.i1, i32 1
+; CHECK: %res.upto2 = insertelement <4 x float> %res.upto1, float %res.i2, i32 2
+; CHECK: %res = insertelement <4 x float> %res.upto2, float %res.i3, i32 3
+; CHECK: ret <4 x float> %res
+  %res = fadd <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>,
+    !fpmath !4 ; expected to reappear on each of the four scalar fadds
+  ret <4 x float> %res
+}
+
+!0 = metadata !{ metadata !"root" } ; referenced by !1 and !2 as their second operand
+!1 = metadata !{ metadata !"set1", metadata !0 } ; !tbaa node on the load in @f3
+!2 = metadata !{ metadata !"set2", metadata !0 } ; !tbaa node on the store in @f3
+!3 = metadata !{ metadata !3 } ; self-referential; used as !llvm.loop and parallel_loop_access in @f4
+!4 = metadata !{ float 4.0 } ; !fpmath operand on the fadd in @f5