Index: polly/trunk/include/polly/CodeGen/BlockGenerators.h
===================================================================
--- polly/trunk/include/polly/CodeGen/BlockGenerators.h
+++ polly/trunk/include/polly/CodeGen/BlockGenerators.h
@@ -63,6 +63,24 @@
 /// @brief Generate a new basic block for a polyhedral statement.
 class BlockGenerator {
 public:
+  /// @brief Map types to resolve scalar dependences.
+  ///
+  ///@{
+
+  /// @see The ScalarMap and PHIOpMap member.
+  using ScalarAllocaMapTy = DenseMap<Instruction *, AllocaInst *>;
+
+  /// @brief Simple vector of instructions to store escape users.
+  using EscapeUserVectorTy = SmallVector<Instruction *, 4>;
+
+  /// @brief Map type to resolve escaping users for scalar instructions.
+  ///
+  /// @see The EscapeMap member.
+  using EscapeUsersAllocaMapTy =
+      DenseMap<Instruction *, std::pair<AllocaInst *, EscapeUserVectorTy>>;
+
+  ///@}
+
   /// @brief Create a generator for basic blocks.
   ///
   /// @param Builder     The LLVM-IR Builder used to generate the statement. The
@@ -71,9 +89,14 @@
   /// @param LI          The loop info for the current function
   /// @param SE          The scalar evolution info for the current function
   /// @param DT          The dominator tree of this function.
+  /// @param ScalarMap   Map from scalars to their demoted location.
+  /// @param PHIOpMap    Map from PHIs to their demoted operand location.
+  /// @param EscapeMap   Map from scalars to their escape users and locations.
   /// @param ExprBuilder An expression builder to generate new access functions.
   BlockGenerator(PollyIRBuilder &Builder, LoopInfo &LI, ScalarEvolution &SE,
-                 DominatorTree &DT, IslExprBuilder *ExprBuilder = nullptr);
+                 DominatorTree &DT, ScalarAllocaMapTy &ScalarMap,
+                 ScalarAllocaMapTy &PHIOpMap, EscapeUsersAllocaMapTy &EscapeMap,
+                 IslExprBuilder *ExprBuilder = nullptr);
 
   /// @brief Copy the basic block.
   ///
@@ -87,6 +110,18 @@
   /// @param LTS       A map from old loops to new induction variables as SCEVs.
   void copyStmt(ScopStmt &Stmt, ValueMapT &GlobalMap, LoopToScevMapT &LTS);
 
+  /// @brief Finalize the code generation for the SCoP @p S.
+  ///
+  /// This will initialize and finalize the scalar variables we demoted during
+  /// the code generation.
+  ///
+  /// @see createScalarInitialization(Region &, ValueMapT &)
+  /// @see createScalarFinalization(Region &)
+  void finalizeSCoP(Scop &S, ValueMapT &VMap);
+
+  /// @brief An empty destructor
+  virtual ~BlockGenerator(){};
+
 protected:
   PollyIRBuilder &Builder;
   LoopInfo &LI;
@@ -96,6 +131,42 @@
   /// @brief The dominator tree of this function.
   DominatorTree &DT;
 
+  /// @brief The entry block of the current function.
+  BasicBlock *EntryBB;
+
+  /// @brief Maps to resolve scalar dependences for PHI operands and scalars.
+  ///
+  /// Usage example:
+  ///
+  ///  x1 = ...   // x1 will be inserted in the ScalarMap and PhiOpMap.
+  ///  for (i=0...N) {
+  ///      x2 = phi(x1, add) // x2 will be inserted in the ScalarMap, x1 and
+  ///                        // add are mapped in the PHIOpMap.
+  ///      add = x2 + A[i];  // add will be inserted in the ScalarMap and
+  ///                        // the PhiOpMap.
+  ///  }
+  ///  print(x1)  // x1 is mapped in the ScalarMap.
+  ///  print(x2)  // x2 is mapped in the ScalarMap.
+  ///  print(add) // add is mapped in the ScalarMap.
+  ///
+  ///{
+
+  /// The PHIOpMap is used to get the alloca to communicate a value to a PHI
+  /// node, hence when the operand of a PHI is demoted the corresponding write
+  /// access will use the PHIOpMap to look for the correct alloca. PHI nodes
+  /// will then read that location in order to get the correct/current operand
+  /// value.
+  ScalarAllocaMapTy &PHIOpMap;
+
+  /// The ScalarMap is used in __all__ other cases, thus always when a scalar
+  /// variable is read/written and the write is not because the scalar is a PHI
+  /// operand.
+  ScalarAllocaMapTy &ScalarMap;
+  ///}
+
+  /// @brief Map from instructions to their escape users as well as the alloca.
+  EscapeUsersAllocaMapTy &EscapeMap;
+
   /// @brief Split @p BB to create a new one we can use to clone @p BB in.
   BasicBlock *splitBB(BasicBlock *BB);
 
@@ -128,6 +199,64 @@
   void copyBB(ScopStmt &Stmt, BasicBlock *BB, BasicBlock *BBCopy,
               ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT &LTS);
 
+  /// @brief Return the alloca for @p ScalarBase in @p Map.
+  ///
+  /// If no alloca was mapped to @p ScalarBase in @p Map a new one is created
+  /// and named after @p ScalarBase with the suffix @p NameExt.
+  ///
+  /// @param ScalarBase The demoted scalar instruction.
+  /// @param Map        The map we should look for a mapped alloca instruction.
+  /// @param NameExt    The suffix we add to the name of a new created alloca.
+  /// @param IsNew      If set it will hold true iff the alloca was created.
+  ///
+  /// @returns The alloca for @p ScalarBase in @p Map.
+  AllocaInst *getOrCreateAlloca(Instruction *ScalarBase, ScalarAllocaMapTy &Map,
+                                const char *NameExt = ".s2a",
+                                bool *IsNew = nullptr);
+
+  /// @brief Generate reload of scalars demoted to memory and needed by @p Inst.
+  ///
+  /// @param Stmt  The statement we generate code for.
+  /// @param Inst  The instruction that might need reloaded values.
+  /// @param BBMap A mapping from old values to their new values in this block.
+  virtual void generateScalarLoads(ScopStmt &Stmt, const Instruction *Inst,
+                                   ValueMapT &BBMap);
+
+  /// @brief Generate the scalar stores for the given statement.
+  ///
+  /// After the statement @p Stmt was copied all inner-SCoP scalar dependences
+  /// starting in @p Stmt (hence all scalar write accesses in @p Stmt) need to
+  /// be demoted to memory.
+  ///
+  /// @param Stmt  The statement we generate code for.
+  /// @param BB    The basic block we generate code for.
+  /// @param BBMap A mapping from old values to their new values in this block.
+  /// @param GlobalMap A mapping for globally replaced values.
+  virtual void generateScalarStores(ScopStmt &Stmt, BasicBlock *BB,
+                                    ValueMapT &BBMAp, ValueMapT &GlobalMap);
+
+  /// @brief Handle users of @p Inst outside the SCoP.
+  ///
+  /// @param R        The current SCoP region.
+  /// @param Inst     The current instruction we check.
+  /// @param InstCopy The copy of the instruction @p Inst in the optimized SCoP.
+  void handleOutsideUsers(const Region &R, Instruction *Inst, Value *InstCopy);
+
+  /// @brief Initialize the memory of demoted scalars.
+  ///
+  /// If a PHI node was demoted and one of its predecessor blocks was outside
+  /// the SCoP we need to initialize the memory cell we demoted the PHI into
+  /// with the value corresponding to that predecessor. As a SCoP is a
+  /// __single__ entry region there is at most one such predecessor.
+  void createScalarInitialization(Region &R, ValueMapT &VMap);
+
+  /// @brief Promote the values of demoted scalars after the SCoP.
+  ///
+  /// If a scalar value was used outside the SCoP we need to promote the value
+  /// stored in the memory cell allocated for that scalar and combine it with
+  /// the original value in the non-optimized SCoP.
+  void createScalarFinalization(Region &R);
+
   /// @brief Get the new version of a value.
   ///
   /// Given an old value, we first check if a new version of this value is
@@ -183,6 +312,17 @@
                              ValueMapT &BBMap, ValueMapT &GlobalMap,
                              LoopToScevMapT &LTS);
 
+  /// @brief Copy a single PHI instruction.
+  ///
+  /// The implementation in the BlockGenerator is trivial, however it allows
+  /// subclasses to handle PHIs different.
+  ///
+  /// @returns The nullptr as the BlockGenerator does not copy PHIs.
+  virtual Value *copyPHIInstruction(ScopStmt &, const PHINode *, ValueMapT &,
+                                    ValueMapT &, LoopToScevMapT &) {
+    return nullptr;
+  }
+
   /// @brief Copy a single Instruction.
   ///
   /// This copies a single Instruction and updates references to old values
@@ -202,6 +342,22 @@
   void copyInstruction(ScopStmt &Stmt, const Instruction *Inst,
                        ValueMapT &BBMap, ValueMapT &GlobalMap,
                        LoopToScevMapT &LTS);
+
+  /// @brief Helper to get the newest version of @p ScalarValue.
+  ///
+  /// @param ScalarValue The original value needed.
+  /// @param R           The current SCoP region.
+  /// @param ReloadMap   The scalar map for demoted values.
+  /// @param BBMap       A mapping from old values to their new values
+  ///                    (for values recalculated within this basic block).
+  /// @param GlobalMap   A mapping from old values to their new values
+  ///                    (for values recalculated in the new ScoP, but not
+  ///                    within this basic block).
+  ///
+  /// @returns The newest version (e.g., reloaded) of the scalar value.
+  Value *getNewScalarValue(Value *ScalarValue, const Region &R,
+                           ScalarAllocaMapTy &ReloadMap, ValueMapT &BBMap,
+                           ValueMapT &GlobalMap);
 };
 
 /// @brief Generate a new vector basic block for a polyhedral statement.
@@ -374,12 +530,82 @@
   /// @param LTS       A map from old loops to new induction variables as SCEVs.
   void copyStmt(ScopStmt &Stmt, ValueMapT &GlobalMap, LoopToScevMapT &LTS);
 
+  /// @brief An empty destructor
+  virtual ~RegionGenerator(){};
+
 private:
+  /// @brief A map from old to new blocks in the region.
+  DenseMap<BasicBlock *, BasicBlock *> BlockMap;
+
+  /// @brief The "BBMaps" for the whole region (one for each block).
+  DenseMap<BasicBlock *, ValueMapT> RegionMaps;
+
+  /// @brief Mapping to remember PHI nodes that still need incoming values.
+  using PHINodePairTy = std::pair<const PHINode *, PHINode *>;
+  DenseMap<BasicBlock *, SmallVector<PHINodePairTy, 4>> IncompletePHINodeMap;
+
   /// @brief Repair the dominance tree after we created a copy block for @p BB.
   ///
   /// @returns The immediate dominator in the DT for @p BBCopy if in the region.
-  BasicBlock *repairDominance(BasicBlock *BB, BasicBlock *BBCopy,
-                              DenseMap<BasicBlock *, BasicBlock *> &BlockMap);
+  BasicBlock *repairDominance(BasicBlock *BB, BasicBlock *BBCopy);
+
+  /// @brief Add the new operand from the copy of @p IncomingBB to @p PHICopy.
+  ///
+  /// @param Stmt       The statement to code generate.
+  /// @param PHI        The original PHI we copy.
+  /// @param PHICopy    The copy of @p PHI.
+  /// @param IncomingBB An incoming block of @p PHI.
+  /// @param GlobalMap  A mapping from old values to their new values
+  ///                   (for values recalculated in the new ScoP, but not
+  ///                   within this basic block).
+  /// @param LTS        A map from old loops to new induction variables as
+  /// SCEVs.
+  void addOperandToPHI(ScopStmt &Stmt, const PHINode *PHI, PHINode *PHICopy,
+                       BasicBlock *IncomingBB, ValueMapT &GlobalMap,
+                       LoopToScevMapT &LTS);
+
+  /// @brief Generate reload of scalars demoted to memory and needed by @p Inst.
+  ///
+  /// @param Stmt  The statement we generate code for.
+  /// @param Inst  The instruction that might need reloaded values.
+  /// @param BBMap A mapping from old values to their new values in this block.
+  virtual void generateScalarLoads(ScopStmt &Stmt, const Instruction *Inst,
+                                   ValueMapT &BBMap) override;
+
+  /// @brief Generate the scalar stores for the given statement.
+  ///
+  /// After the statement @p Stmt was copied all inner-SCoP scalar dependences
+  /// starting in @p Stmt (hence all scalar write accesses in @p Stmt) need to
+  /// be demoted to memory.
+  ///
+  /// @param Stmt  The statement we generate code for.
+  /// @param BB    The basic block we generate code for.
+  /// @param BBMap A mapping from old values to their new values in this block.
+  /// @param GlobalMap  A mapping from old values to their new values
+  ///                   (for values recalculated in the new ScoP, but not
+  ///                   within this basic block).
+  virtual void generateScalarStores(ScopStmt &Stmt, BasicBlock *BB,
+                                    ValueMapT &BBMAp,
+                                    ValueMapT &GlobalMap) override;
+
+  /// @brief Copy a single PHI instruction.
+  ///
+  /// This copies a single PHI instruction and updates references to old values
+  /// with references to new values, as defined by GlobalMap and BBMap.
+  ///
+  /// @param Stmt      The statement to code generate.
+  /// @param PHI       The PHI instruction to copy.
+  /// @param BBMap     A mapping from old values to their new values
+  ///                  (for values recalculated within this basic block).
+  /// @param GlobalMap A mapping from old values to their new values
+  ///                  (for values recalculated in the new ScoP, but not
+  ///                  within this basic block).
+  /// @param LTS       A map from old loops to new induction variables as SCEVs.
+  ///
+  /// @returns The copied instruction or nullptr if no copy was made.
+  virtual Value *copyPHIInstruction(ScopStmt &Stmt, const PHINode *Inst,
+                                    ValueMapT &BBMap, ValueMapT &GlobalMap,
+                                    LoopToScevMapT &LTS) override;
 };
 }
 #endif
Index: polly/trunk/include/polly/CodeGen/IslNodeBuilder.h
===================================================================
--- polly/trunk/include/polly/CodeGen/IslNodeBuilder.h
+++ polly/trunk/include/polly/CodeGen/IslNodeBuilder.h
@@ -32,13 +32,20 @@
                  DominatorTree &DT, Scop &S)
       : S(S), Builder(Builder), Annotator(Annotator), Rewriter(SE, DL, "polly"),
         ExprBuilder(Builder, IDToValue, Rewriter, DT, LI),
-        BlockGen(Builder, LI, SE, DT, &ExprBuilder), RegionGen(BlockGen), P(P),
-        DL(DL), LI(LI), SE(SE), DT(DT) {}
+        BlockGen(Builder, LI, SE, DT, ScalarMap, PHIOpMap, EscapeMap,
+                 &ExprBuilder),
+        RegionGen(BlockGen), P(P), DL(DL), LI(LI), SE(SE), DT(DT) {}
 
   ~IslNodeBuilder() {}
 
   void addParameters(__isl_take isl_set *Context);
   void create(__isl_take isl_ast_node *Node);
+
+  /// @brief Finalize code generation for the SCoP @p S.
+  ///
+  /// @see BlockGenerator::finalizeSCoP(Scop &S)
+  void finalizeSCoP(Scop &S) { BlockGen.finalizeSCoP(S, ValueMap); }
+
   IslExprBuilder &getExprBuilder() { return ExprBuilder; }
 
 private:
@@ -50,9 +57,26 @@
   SCEVExpander Rewriter;
 
   IslExprBuilder ExprBuilder;
+
+  /// @brief Maps used by the block and region generator to demote scalars.
+  ///
+  ///@{
+
+  /// @brief See BlockGenerator::ScalarMap.
+  BlockGenerator::ScalarAllocaMapTy ScalarMap;
+
+  /// @brief See BlockGenerator::PhiOpMap.
+  BlockGenerator::ScalarAllocaMapTy PHIOpMap;
+
+  /// @brief See BlockGenerator::EscapeMap.
+  BlockGenerator::EscapeUsersAllocaMapTy EscapeMap;
+
+  ///@}
+
+  /// @brief The generator used to copy a basic block.
   BlockGenerator BlockGen;
 
-  /// @brief Generator for region statements.
+  /// @brief The generator used to copy a non-affine region.
   RegionGenerator RegionGen;
 
   Pass *const P;
Index: polly/trunk/include/polly/ScopInfo.h
===================================================================
--- polly/trunk/include/polly/ScopInfo.h
+++ polly/trunk/include/polly/ScopInfo.h
@@ -25,6 +25,8 @@
 #include "llvm/Analysis/RegionPass.h"
 #include "isl/ctx.h"
 
+#include <forward_list>
+
 using namespace llvm;
 
 namespace llvm {
@@ -410,7 +412,11 @@
 /// accesses.
 /// At the moment every statement represents a single basic block of LLVM-IR.
 class ScopStmt {
-  //===-------------------------------------------------------------------===//
+public:
+  /// @brief List to hold all (scalar) memory accesses mapped to an instruction.
+  using MemoryAccessList = std::forward_list<MemoryAccess>;
+
+private:
   ScopStmt(const ScopStmt &) = delete;
   const ScopStmt &operator=(const ScopStmt &) = delete;
 
@@ -476,7 +482,9 @@
   /// The only side effects of a statement are its memory accesses.
   typedef SmallVector<MemoryAccess *, 8> MemoryAccessVec;
   MemoryAccessVec MemAccs;
-  std::map<const Instruction *, MemoryAccess *> InstructionToAccess;
+
+  /// @brief Mapping from instructions to (scalar) memory accesses.
+  DenseMap<const Instruction *, MemoryAccessList *> InstructionToAccess;
 
   //@}
 
@@ -628,16 +636,31 @@
   /// @brief Return true if this statement represents a whole region.
   bool isRegionStmt() const { return R != nullptr; }
 
+  /// @brief Return the (scalar) memory accesses for @p Inst.
+  const MemoryAccessList &getAccessesFor(const Instruction *Inst) const {
+    MemoryAccessList *MAL = lookupAccessesFor(Inst);
+    assert(MAL && "Cannot get memory accesses because they do not exist!");
+    return *MAL;
+  }
+
+  /// @brief Return the (scalar) memory accesses for @p Inst if any.
+  MemoryAccessList *lookupAccessesFor(const Instruction *Inst) const {
+    auto It = InstructionToAccess.find(Inst);
+    return It == InstructionToAccess.end() ? nullptr : It->getSecond();
+  }
+
+  /// @brief Return the __first__ (scalar) memory access for @p Inst.
   const MemoryAccess &getAccessFor(const Instruction *Inst) const {
-    MemoryAccess *A = lookupAccessFor(Inst);
-    assert(A && "Cannot get memory access because it does not exist!");
-    return *A;
+    MemoryAccess *MA = lookupAccessFor(Inst);
+    assert(MA && "Cannot get memory access because it does not exist!");
+    return *MA;
   }
 
+  /// @brief Return the __first__ (scalar) memory access for @p Inst if any.
   MemoryAccess *lookupAccessFor(const Instruction *Inst) const {
-    std::map<const Instruction *, MemoryAccess *>::const_iterator at =
-        InstructionToAccess.find(Inst);
-    return at == InstructionToAccess.end() ? NULL : at->second;
+    auto It = InstructionToAccess.find(Inst);
+    return It == InstructionToAccess.end() ? nullptr
+                                           : &It->getSecond()->front();
   }
 
   void setBasicBlock(BasicBlock *Block) {
Index: polly/trunk/lib/Analysis/ScopInfo.cpp
===================================================================
--- polly/trunk/lib/Analysis/ScopInfo.cpp
+++ polly/trunk/lib/Analysis/ScopInfo.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -877,19 +878,11 @@
     if (isApproximated && Access.isWrite())
       Access.setMayWrite();
 
-    MemAccs.push_back(
-        new MemoryAccess(Access, AccessInst, this, SAI, MemAccs.size()));
-
-    // We do not track locations for scalar memory accesses at the moment.
-    //
-    // We do not have a use for this information at the moment. If we need this
-    // at some point, the "instruction -> access" mapping needs to be enhanced
-    // as a single instruction could then possibly perform multiple accesses.
-    if (!Access.isScalar()) {
-      assert(!InstructionToAccess.count(AccessInst) &&
-             "Unexpected 1-to-N mapping on instruction to access map!");
-      InstructionToAccess[AccessInst] = MemAccs.back();
-    }
+    MemoryAccessList *&MAL = InstructionToAccess[AccessInst];
+    if (!MAL)
+      MAL = new MemoryAccessList();
+    MAL->emplace_front(Access, AccessInst, this, SAI, MemAccs.size());
+    MemAccs.push_back(&MAL->front());
   }
 }
 
@@ -1258,11 +1251,7 @@
 }
 
 ScopStmt::~ScopStmt() {
-  while (!MemAccs.empty()) {
-    delete MemAccs.back();
-    MemAccs.pop_back();
-  }
-
+  DeleteContainerSeconds(InstructionToAccess);
   isl_set_free(Domain);
   isl_map_free(Schedule);
 }
Index: polly/trunk/lib/CodeGen/BlockGenerators.cpp
===================================================================
--- polly/trunk/lib/CodeGen/BlockGenerators.cpp
+++ polly/trunk/lib/CodeGen/BlockGenerators.cpp
@@ -81,8 +81,13 @@
 
 BlockGenerator::BlockGenerator(PollyIRBuilder &B, LoopInfo &LI,
                                ScalarEvolution &SE, DominatorTree &DT,
+                               ScalarAllocaMapTy &ScalarMap,
+                               ScalarAllocaMapTy &PHIOpMap,
+                               EscapeUsersAllocaMapTy &EscapeMap,
                                IslExprBuilder *ExprBuilder)
-    : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT) {}
+    : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT),
+      EntryBB(nullptr), PHIOpMap(PHIOpMap), ScalarMap(ScalarMap),
+      EscapeMap(EscapeMap) {}
 
 Value *BlockGenerator::getNewValue(ScopStmt &Stmt, const Value *Old,
                                    ValueMapT &BBMap, ValueMapT &GlobalMap,
@@ -242,13 +247,22 @@
 void BlockGenerator::copyInstruction(ScopStmt &Stmt, const Instruction *Inst,
                                      ValueMapT &BBMap, ValueMapT &GlobalMap,
                                      LoopToScevMapT &LTS) {
+
+  // First check for possible scalar dependences for this instruction.
+  generateScalarLoads(Stmt, Inst, BBMap);
+
   // Terminator instructions control the control flow. They are explicitly
   // expressed in the clast and do not need to be copied.
   if (Inst->isTerminator())
     return;
 
-  if (canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion()))
+  Loop *L = getLoopForInst(Inst);
+  if ((Stmt.isBlockStmt() || !Stmt.getRegion()->contains(L)) &&
+      canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion())) {
+    Value *NewValue = getNewValue(Stmt, Inst, BBMap, GlobalMap, LTS, L);
+    BBMap[Inst] = NewValue;
     return;
+  }
 
   if (const LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
     Value *NewLoad = generateScalarLoad(Stmt, Load, BBMap, GlobalMap, LTS);
@@ -266,6 +280,11 @@
     return;
   }
 
+  if (const PHINode *PHI = dyn_cast<PHINode>(Inst)) {
+    copyPHIInstruction(Stmt, PHI, BBMap, GlobalMap, LTS);
+    return;
+  }
+
   // Skip some special intrinsics for which we do not adjust the semantics to
   // the new schedule. All others are handled like every other instruction.
   if (auto *IT = dyn_cast<IntrinsicInst>(Inst)) {
@@ -323,8 +342,329 @@
                             ValueMapT &BBMap, ValueMapT &GlobalMap,
                             LoopToScevMapT &LTS) {
   Builder.SetInsertPoint(CopyBB->begin());
+  EntryBB = &CopyBB->getParent()->getEntryBlock();
+
   for (Instruction &Inst : *BB)
     copyInstruction(Stmt, &Inst, BBMap, GlobalMap, LTS);
+
+  // After a basic block was copied store all scalars that escape this block
+  // in their alloca. First the scalars that have dependences inside the SCoP,
+  // then the ones that might escape the SCoP.
+  generateScalarStores(Stmt, BB, BBMap, GlobalMap);
+
+  const Region &R = Stmt.getParent()->getRegion();
+  for (Instruction &Inst : *BB)
+    handleOutsideUsers(R, &Inst, BBMap[&Inst]);
+}
+
+AllocaInst *BlockGenerator::getOrCreateAlloca(Instruction *ScalarBase,
+                                              ScalarAllocaMapTy &Map,
+                                              const char *NameExt,
+                                              bool *IsNew) {
+
+  // Check if an alloca was cached for the base instruction.
+  AllocaInst *&Addr = Map[ScalarBase];
+
+  // If needed indicate if it was found already or will be created.
+  if (IsNew)
+    *IsNew = (Addr == nullptr);
+
+  // If no alloca was found create one and insert it in the entry block.
+  if (!Addr) {
+    auto *Ty = ScalarBase->getType();
+    Addr = new AllocaInst(Ty, ScalarBase->getName() + NameExt);
+    Addr->insertBefore(EntryBB->getFirstInsertionPt());
+  }
+
+  return Addr;
+}
+
+void BlockGenerator::handleOutsideUsers(const Region &R, Instruction *Inst,
+                                        Value *InstCopy) {
+  BasicBlock *ExitBB = R.getExit();
+
+  EscapeUserVectorTy EscapeUsers;
+  for (User *U : Inst->users()) {
+
+    // Non-instruction user will never escape.
+    Instruction *UI = dyn_cast<Instruction>(U);
+    if (!UI)
+      continue;
+
+    if (R.contains(UI) && ExitBB != UI->getParent())
+      continue;
+
+    EscapeUsers.push_back(UI);
+  }
+
+  // Exit if no escape uses were found.
+  if (EscapeUsers.empty())
+    return;
+
+  // If there are escape users we get the alloca for this instruction and put
+  // it in the EscapeMap for later finalization. However, if the alloca was not
+  // created by an already handled scalar dependence we have to initialize it
+  // also. Lastly, if the instruction was copied multiple times we already did
+  // this and can exit.
+  if (EscapeMap.count(Inst))
+    return;
+
+  // Get or create an escape alloca for this instruction.
+  bool IsNew;
+  AllocaInst *ScalarAddr =
+      getOrCreateAlloca(Inst, ScalarMap, ".escape", &IsNew);
+
+  // Remember that this instruction has escape uses and the escape alloca.
+  EscapeMap[Inst] = std::make_pair(ScalarAddr, std::move(EscapeUsers));
+
+  // If the escape alloca was just created store the instruction in there,
+  // otherwise that happened already.
+  if (IsNew) {
+    assert(InstCopy && "Except PHIs every instruction should have a copy!");
+    Builder.CreateStore(InstCopy, ScalarAddr);
+  }
+}
+
+void BlockGenerator::generateScalarLoads(ScopStmt &Stmt,
+                                         const Instruction *Inst,
+                                         ValueMapT &BBMap) {
+
+  // Iterate over all memory accesses for the given instruction and handle all
+  // scalar reads.
+  if (ScopStmt::MemoryAccessList *MAL = Stmt.lookupAccessesFor(Inst)) {
+    for (MemoryAccess &MA : *MAL) {
+      if (!MA.isScalar() || !MA.isRead())
+        continue;
+
+      Instruction *ScalarBase = cast<Instruction>(MA.getBaseAddr());
+      Instruction *ScalarInst = MA.getAccessInstruction();
+
+      PHINode *ScalarBasePHI = dyn_cast<PHINode>(ScalarBase);
+
+      // This is either a common scalar use (second case) or the use of a phi
+      // operand by the PHI node (first case).
+      if (ScalarBasePHI == ScalarInst) {
+        AllocaInst *PHIOpAddr =
+            getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops");
+        LoadInst *LI =
+            Builder.CreateLoad(PHIOpAddr, PHIOpAddr->getName() + ".reload");
+        BBMap[ScalarBase] = LI;
+      } else {
+        // For non-PHI operand uses we look up the alloca in the ScalarMap,
+        // reload it and add the mapping to the ones in the current basic block.
+        AllocaInst *ScalarAddr =
+            getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+        LoadInst *LI =
+            Builder.CreateLoad(ScalarAddr, ScalarAddr->getName() + ".reload");
+        BBMap[ScalarBase] = LI;
+      }
+    }
+  }
+}
+
+Value *BlockGenerator::getNewScalarValue(Value *ScalarValue, const Region &R,
+                                         ScalarAllocaMapTy &ReloadMap,
+                                         ValueMapT &BBMap,
+                                         ValueMapT &GlobalMap) {
+  // If the value we want to store is an instruction we might have demoted it
+  // in order to make it accessible here. In such a case a reload is
+  // necessary. If it is no instruction it will always be a value that
+  // dominates the current point and we can just use it. In total there are 4
+  // options:
+  //  (1) The value is no instruction ==> use the value.
+  //  (2) The value is an instruction that was split out of the region prior to
+  //      code generation  ==> use the instruction as it dominates the region.
+  //  (3) The value is an instruction:
+  //      (a) The value was defined in the current block, thus a copy is in
+  //          the BBMap ==> use the mapped value.
+  //      (b) The value was defined in a previous block, thus we demoted it
+  //          earlier ==> use the reloaded value.
+  Instruction *ScalarValueInst = dyn_cast<Instruction>(ScalarValue);
+  if (!ScalarValueInst)
+    return ScalarValue;
+
+  if (!R.contains(ScalarValueInst)) {
+    if (Value *ScalarValueCopy = GlobalMap.lookup(ScalarValueInst))
+      return /* Case (3a) */ ScalarValueCopy;
+    else
+      return /* Case 2 */ ScalarValue;
+  }
+
+  if (Value *ScalarValueCopy = BBMap.lookup(ScalarValueInst))
+    return /* Case (3a) */ ScalarValueCopy;
+
+  // Case (3b)
+  assert(ReloadMap.count(ScalarValueInst) &&
+         "ScalarInst not mapped in the block and not in the given reload map!");
+  Value *ReloadAddr = ReloadMap[ScalarValueInst];
+  ScalarValue =
+      Builder.CreateLoad(ReloadAddr, ReloadAddr->getName() + ".reload");
+
+  return ScalarValue;
+}
+
+void BlockGenerator::generateScalarStores(ScopStmt &Stmt, BasicBlock *BB,
+                                          ValueMapT &BBMap,
+                                          ValueMapT &GlobalMap) {
+  const Region &R = Stmt.getParent()->getRegion();
+
+  assert(Stmt.isBlockStmt() && BB == Stmt.getBasicBlock() &&
+         "Region statements need to use the generateScalarStores() "
+         "function in the RegionGenerator");
+
+  // Set to remember a store to the phiops alloca of a PHINode. It is needed as
+  // we might have multiple write accesses to the same PHI and while one is the
+  // self write of the PHI (to the ScalarMap alloca) the other is the write to
+  // the operand alloca (PHIOpMap).
+  SmallPtrSet<PHINode *, 4> SeenPHIs;
+
+  // Iterate over all accesses in the given statement.
+  for (MemoryAccess *MA : Stmt) {
+
+    // Skip non-scalar and read accesses.
+    if (!MA->isScalar() || MA->isRead())
+      continue;
+
+    Instruction *ScalarBase = cast<Instruction>(MA->getBaseAddr());
+    Instruction *ScalarInst = MA->getAccessInstruction();
+    PHINode *ScalarBasePHI = dyn_cast<PHINode>(ScalarBase);
+
+    // Get the alloca node for the base instruction and the value we want to
+    // store. In total there are 4 options:
+    //  (1) The base is no PHI, hence it is a simple scalar def-use chain.
+    //  (2) The base is a PHI,
+    //      (a) and the write is caused by an operand in the block.
+    //      (b) and it is the PHI self write (same as case (1)).
+    //      (c) (2a) and (2b) are not distinguishable.
+    // For case (1) and (2b) we get the alloca from the scalar map and the value
+    // we want to store is initialized with the instruction attached to the
+    // memory access. For case (2a) we get the alloca from the PHI operand map
+    // and the value we want to store is initialized with the incoming value for
+    // this block. The tricky case (2c) is when both (2a) and (2b) match. This
+    // happens if the PHI operand is in the same block as the PHI. To handle
+    // that we choose the alloca of (2a) first and (2b) for the next write
+    // access to that PHI (there must be 2).
+    Value *ScalarValue = nullptr;
+    AllocaInst *ScalarAddr = nullptr;
+
+    if (!ScalarBasePHI) {
+      // Case (1)
+      ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+      ScalarValue = ScalarInst;
+    } else {
+      int PHIIdx = ScalarBasePHI->getBasicBlockIndex(BB);
+      if (ScalarBasePHI != ScalarInst) {
+        // Case (2a)
+        assert(PHIIdx >= 0 && "Bad scalar write to PHI operand");
+        SeenPHIs.insert(ScalarBasePHI);
+        ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops");
+        ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx);
+      } else if (PHIIdx < 0) {
+        // Case (2b)
+        ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+        ScalarValue = ScalarInst;
+      } else {
+        // Case (2c)
+        if (SeenPHIs.insert(ScalarBasePHI).second) {
+          // First access ==> same as (2a)
+          ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops");
+          ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx);
+        } else {
+          // Second access ==> same as (2b)
+          ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+          ScalarValue = ScalarInst;
+        }
+      }
+    }
+
+    ScalarValue =
+        getNewScalarValue(ScalarValue, R, ScalarMap, BBMap, GlobalMap);
+    Builder.CreateStore(ScalarValue, ScalarAddr);
+  }
+}
+
+void BlockGenerator::createScalarInitialization(Region &R,
+                                                ValueMapT &GlobalMap) {
+  // The split block __just before__ the region and optimized region.
+  BasicBlock *SplitBB = R.getEnteringBlock();
+  BranchInst *SplitBBTerm = cast<BranchInst>(SplitBB->getTerminator());
+  assert(SplitBBTerm->getNumSuccessors() == 2 && "Bad region entering block!");
+
+  // Get the start block of the __optimized__ region.
+  BasicBlock *StartBB = SplitBBTerm->getSuccessor(0);
+  if (StartBB == R.getEntry())
+    StartBB = SplitBBTerm->getSuccessor(1);
+
+  // For each PHI predecessor outside the region store the incoming operand
+  // value prior to entering the optimized region.
+  Builder.SetInsertPoint(StartBB->getTerminator());
+
+  ScalarAllocaMapTy EmptyMap;
+  for (const auto &PHIOpMapping : PHIOpMap) {
+    const PHINode *PHI = cast<PHINode>(PHIOpMapping.getFirst());
+
+    // Check if this PHI has the split block as predecessor (that is the only
+    // possible predecessor outside the SCoP).
+    int idx = PHI->getBasicBlockIndex(SplitBB);
+    if (idx < 0)
+      continue;
+
+    Value *ScalarValue = PHI->getIncomingValue(idx);
+    ScalarValue =
+        getNewScalarValue(ScalarValue, R, EmptyMap, GlobalMap, GlobalMap);
+
+    // If the split block is the predecessor initialize the PHI operator alloca.
+    Builder.CreateStore(ScalarValue, PHIOpMapping.getSecond());
+  }
+}
+
+void BlockGenerator::createScalarFinalization(Region &R) {
+  // The exit block of the __unoptimized__ region.
+  BasicBlock *ExitBB = R.getExitingBlock();
+  // The merge block __just after__ the region and the optimized region.
+  BasicBlock *MergeBB = R.getExit();
+
+  // The exit block of the __optimized__ region.
+  BasicBlock *OptExitBB = *(pred_begin(MergeBB));
+  if (OptExitBB == ExitBB)
+    OptExitBB = *(++pred_begin(MergeBB));
+
+  Builder.SetInsertPoint(OptExitBB->getTerminator());
+  for (const auto &EscapeMapping : EscapeMap) {
+    // Extract the escaping instruction and the escaping users as well as the
+    // alloca the instruction was demoted to.
+    Instruction *EscapeInst = EscapeMapping.getFirst();
+    const auto &EscapeMappingValue = EscapeMapping.getSecond();
+    const EscapeUserVectorTy &EscapeUsers = EscapeMappingValue.second;
+    AllocaInst *ScalarAddr = EscapeMappingValue.first;
+
+    // Reload the demoted instruction in the optimized version of the SCoP.
+    Instruction *EscapeInstReload =
+        Builder.CreateLoad(ScalarAddr, EscapeInst->getName() + ".final_reload");
+
+    // Create the merge PHI that merges the optimized and unoptimized version.
+    PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2,
+                                        EscapeInst->getName() + ".merge");
+    MergePHI->insertBefore(MergeBB->getFirstInsertionPt());
+
+    // Add the respective values to the merge PHI.
+    MergePHI->addIncoming(EscapeInstReload, OptExitBB);
+    MergePHI->addIncoming(EscapeInst, ExitBB);
+
+    // The information of scalar evolution about the escaping instruction needs
+    // to be revoked so the new merged instruction will be used.
+    if (SE.isSCEVable(EscapeInst->getType()))
+      SE.forgetValue(EscapeInst);
+
+    // Replace all uses of the demoted instruction with the merge PHI.
+    for (Instruction *EUser : EscapeUsers)
+      EUser->replaceUsesOfWith(EscapeInst, MergePHI);
+  }
+}
+
+void BlockGenerator::finalizeSCoP(Scop &S, ValueMapT &GlobalMap) {
+  createScalarInitialization(S.getRegion(), GlobalMap);
+  createScalarFinalization(S.getRegion());
 }
 
 VectorBlockGenerator::VectorBlockGenerator(BlockGenerator &BlockGen,
@@ -679,9 +1019,8 @@
     copyInstruction(Stmt, &Inst, VectorBlockMap, ScalarBlockMap);
 }
 
-BasicBlock *RegionGenerator::repairDominance(
-    BasicBlock *BB, BasicBlock *BBCopy,
-    DenseMap<BasicBlock *, BasicBlock *> &BlockMap) {
+BasicBlock *RegionGenerator::repairDominance(BasicBlock *BB,
+                                             BasicBlock *BBCopy) {
 
   BasicBlock *BBIDom = DT.getNode(BB)->getIDom()->getBlock();
   BasicBlock *BBCopyIDom = BlockMap.lookup(BBIDom);
@@ -697,20 +1036,31 @@
   assert(Stmt.isRegionStmt() &&
          "Only region statements can be copied by the block generator");
 
+  // Forget all old mappings.
+  BlockMap.clear();
+  RegionMaps.clear();
+  IncompletePHINodeMap.clear();
+
   // The region represented by the statement.
   Region *R = Stmt.getRegion();
 
-  // The "BBMaps" for the whole region.
-  DenseMap<BasicBlock *, ValueMapT> RegionMaps;
+  // Create a dedicated entry for the region where we can reload all demoted
+  // inputs.
+  BasicBlock *EntryBB = R->getEntry();
+  BasicBlock *EntryBBCopy =
+      SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), &DT, &LI);
+  EntryBBCopy->setName("polly.stmt." + EntryBB->getName() + ".entry");
+  Builder.SetInsertPoint(EntryBBCopy->begin());
 
-  // A map from old to new blocks in the region
-  DenseMap<BasicBlock *, BasicBlock *> BlockMap;
+  for (auto PI = pred_begin(EntryBB), PE = pred_end(EntryBB); PI != PE; ++PI)
+    if (!R->contains(*PI))
+      BlockMap[*PI] = EntryBBCopy;
 
   // Iterate over all blocks in the region in a breadth-first search.
   std::deque<BasicBlock *> Blocks;
   SmallPtrSet<BasicBlock *, 8> SeenBlocks;
-  Blocks.push_back(R->getEntry());
-  SeenBlocks.insert(R->getEntry());
+  Blocks.push_back(EntryBB);
+  SeenBlocks.insert(EntryBB);
 
   while (!Blocks.empty()) {
     BasicBlock *BB = Blocks.front();
@@ -718,7 +1068,10 @@
 
     // First split the block and update dominance information.
     BasicBlock *BBCopy = splitBB(BB);
-    BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy, BlockMap);
+    BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy);
+
+    // In order to remap PHI nodes we store also basic block mappings.
+    BlockMap[BB] = BBCopy;
 
     // Get the mapping for this block and initialize it with the mapping
     // available at its immediate dominator (in the new region).
@@ -728,22 +1081,28 @@
     // Copy the block with the BlockGenerator.
     copyBB(Stmt, BB, BBCopy, RegionMap, GlobalMap, LTS);
 
+    // In order to remap PHI nodes we store also basic block mappings.
+    BlockMap[BB] = BBCopy;
+
+    // Add values to incomplete PHI nodes waiting for this block to be copied.
+    for (const PHINodePairTy &PHINodePair : IncompletePHINodeMap[BB])
+      addOperandToPHI(Stmt, PHINodePair.first, PHINodePair.second, BB,
+                      GlobalMap, LTS);
+    IncompletePHINodeMap[BB].clear();
+
     // And continue with new successors inside the region.
     for (auto SI = succ_begin(BB), SE = succ_end(BB); SI != SE; SI++)
       if (R->contains(*SI) && SeenBlocks.insert(*SI).second)
         Blocks.push_back(*SI);
-
-    // In order to remap PHI nodes we store also basic block mappings.
-    BlockMap[BB] = BBCopy;
   }
 
   // Now create a new dedicated region exit block and add it to the region map.
   BasicBlock *ExitBBCopy =
       SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), &DT, &LI);
-  ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".as.exit");
+  ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".exit");
   BlockMap[R->getExit()] = ExitBBCopy;
 
-  repairDominance(R->getExit(), ExitBBCopy, BlockMap);
+  repairDominance(R->getExit(), ExitBBCopy);
 
   // As the block generator doesn't handle control flow we need to add the
   // region control flow by hand after all blocks have been copied.
@@ -762,6 +1121,178 @@
     BICopy->eraseFromParent();
   }
 
+  // Add counting PHI nodes to all loops in the region that can be used as
+  // replacement for SCEVs refering to the old loop.
+  for (BasicBlock *BB : SeenBlocks) {
+    Loop *L = LI.getLoopFor(BB);
+    if (L == nullptr || L->getHeader() != BB)
+      continue;
+
+    BasicBlock *BBCopy = BlockMap[BB];
+    Value *NullVal = Builder.getInt32(0);
+    PHINode *LoopPHI =
+        PHINode::Create(Builder.getInt32Ty(), 2, "polly.subregion.iv");
+    Instruction *LoopPHIInc = BinaryOperator::CreateAdd(
+        LoopPHI, Builder.getInt32(1), "polly.subregion.iv.inc");
+    LoopPHI->insertBefore(BBCopy->begin());
+    LoopPHIInc->insertBefore(BBCopy->getTerminator());
+
+    for (auto *PredBB : make_range(pred_begin(BB), pred_end(BB))) {
+      if (!R->contains(PredBB))
+        continue;
+      if (L->contains(PredBB))
+        LoopPHI->addIncoming(LoopPHIInc, BlockMap[PredBB]);
+      else
+        LoopPHI->addIncoming(NullVal, BlockMap[PredBB]);
+    }
+
+    for (auto *PredBBCopy : make_range(pred_begin(BBCopy), pred_end(BBCopy)))
+      if (LoopPHI->getBasicBlockIndex(PredBBCopy) < 0)
+        LoopPHI->addIncoming(NullVal, PredBBCopy);
+
+    LTS[L] = SE.getUnknown(LoopPHI);
+  }
+
+  // Add all mappings from the region to the global map so outside uses will use
+  // the copied instructions.
+  for (auto &BBMap : RegionMaps)
+    GlobalMap.insert(BBMap.second.begin(), BBMap.second.end());
+
   // Reset the old insert point for the build.
   Builder.SetInsertPoint(ExitBBCopy->begin());
 }
+
+void RegionGenerator::generateScalarLoads(ScopStmt &Stmt,
+                                          const Instruction *Inst,
+                                          ValueMapT &BBMap) {
+
+  // Inside a non-affine region PHI nodes are copied not demoted. Once the
+  // phi is copied it will reload all inputs from outside the region, hence
+  // we do not need to generate code for the read access of the operands of a
+  // PHI.
+  if (isa<PHINode>(Inst))
+    return;
+
+  return BlockGenerator::generateScalarLoads(Stmt, Inst, BBMap);
+}
+
+void RegionGenerator::generateScalarStores(ScopStmt &Stmt, BasicBlock *BB,
+                                           ValueMapT &BBMap,
+                                           ValueMapT &GlobalMap) {
+  const Region &R = Stmt.getParent()->getRegion();
+
+  Region *StmtR = Stmt.getRegion();
+  assert(StmtR && "Block statements need to use the generateScalarStores() "
+                  "function in the BlockGenerator");
+
+  BasicBlock *ExitBB = StmtR->getExit();
+
+  // For region statements three kinds of scalar stores exists:
+  //  (1) A definition used by a non-phi instruction outside the region.
+  //  (2) A phi-instruction in the region entry.
+  //  (3) A write to a phi instruction in the region exit.
+  // The last case is the tricky one since we do not know anymore which
+  // predecessor of the exit needs to store the operand value that doesn't
+  // have a definition in the region. Therefore, we have to check in each
+  // block in the region if we should store the value or not.
+
+  // Iterate over all accesses in the given statement.
+  for (MemoryAccess *MA : Stmt) {
+
+    // Skip non-scalar and read accesses.
+    if (!MA->isScalar() || MA->isRead())
+      continue;
+
+    Instruction *ScalarBase = cast<Instruction>(MA->getBaseAddr());
+    Instruction *ScalarInst = MA->getAccessInstruction();
+    PHINode *ScalarBasePHI = dyn_cast<PHINode>(ScalarBase);
+
+    Value *ScalarValue = nullptr;
+    AllocaInst *ScalarAddr = nullptr;
+
+    if (!ScalarBasePHI) {
+      // Case (1)
+      ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+      ScalarValue = ScalarInst;
+    } else if (ScalarBasePHI->getParent() != ExitBB) {
+      // Case (2)
+      assert(ScalarBasePHI->getParent() == StmtR->getEntry() &&
+             "Bad PHI self write in non-affine region");
+      assert(ScalarBase == ScalarInst &&
+             "Bad PHI self write in non-affine region");
+      ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+      ScalarValue = ScalarInst;
+    } else {
+      int PHIIdx = ScalarBasePHI->getBasicBlockIndex(BB);
+      // Skip accesses we will not handle in this basic block but in another one
+      // in the statement region.
+      if (PHIIdx < 0)
+        continue;
+
+      // Case (3)
+      ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops");
+      ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx);
+    }
+
+    ScalarValue =
+        getNewScalarValue(ScalarValue, R, ScalarMap, BBMap, GlobalMap);
+    Builder.CreateStore(ScalarValue, ScalarAddr);
+  }
+}
+
+void RegionGenerator::addOperandToPHI(ScopStmt &Stmt, const PHINode *PHI,
+                                      PHINode *PHICopy, BasicBlock *IncomingBB,
+                                      ValueMapT &GlobalMap,
+                                      LoopToScevMapT &LTS) {
+  Region *StmtR = Stmt.getRegion();
+
+  // If the incoming block was not yet copied mark this PHI as incomplete.
+  // Once the block will be copied the incoming value will be added.
+  BasicBlock *BBCopy = BlockMap[IncomingBB];
+  if (!BBCopy) {
+    assert(StmtR->contains(IncomingBB) &&
+           "Bad incoming block for PHI in non-affine region");
+    IncompletePHINodeMap[IncomingBB].push_back(std::make_pair(PHI, PHICopy));
+    return;
+  }
+
+  Value *OpCopy = nullptr;
+  if (StmtR->contains(IncomingBB)) {
+    assert(RegionMaps.count(BBCopy) &&
+           "Incoming PHI block did not have a BBMap");
+    ValueMapT &BBCopyMap = RegionMaps[BBCopy];
+
+    Value *Op = PHI->getIncomingValueForBlock(IncomingBB);
+    OpCopy =
+        getNewValue(Stmt, Op, BBCopyMap, GlobalMap, LTS, getLoopForInst(PHI));
+  } else {
+
+    if (PHICopy->getBasicBlockIndex(BBCopy) >= 0)
+      return;
+
+    AllocaInst *PHIOpAddr =
+        getOrCreateAlloca(const_cast<PHINode *>(PHI), PHIOpMap, ".phiops");
+    OpCopy = new LoadInst(PHIOpAddr, PHIOpAddr->getName() + ".reload",
+                          BlockMap[IncomingBB]->getTerminator());
+  }
+
+  assert(OpCopy && "Incoming PHI value was not copied properly");
+  assert(BBCopy && "Incoming PHI block was not copied properly");
+  PHICopy->addIncoming(OpCopy, BBCopy);
+}
+
+Value *RegionGenerator::copyPHIInstruction(ScopStmt &Stmt, const PHINode *PHI,
+                                           ValueMapT &BBMap,
+                                           ValueMapT &GlobalMap,
+                                           LoopToScevMapT &LTS) {
+  unsigned NumIncoming = PHI->getNumIncomingValues();
+  PHINode *PHICopy =
+      Builder.CreatePHI(PHI->getType(), NumIncoming, "polly." + PHI->getName());
+  PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHI());
+  BBMap[PHI] = PHICopy;
+
+  for (unsigned u = 0; u < NumIncoming; u++)
+    addOperandToPHI(Stmt, PHI, PHICopy, PHI->getIncomingBlock(u), GlobalMap,
+                    LTS);
+  return PHICopy;
+}
Index: polly/trunk/lib/CodeGen/CodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/CodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/CodeGeneration.cpp
@@ -131,6 +131,8 @@
 
     NodeBuilder.create(AstRoot);
 
+    NodeBuilder.finalizeSCoP(S);
+
     assert(!verifyGeneratedFunction(S, *EnteringBB->getParent()) &&
            "Verification of generated function failed");
     return true;
Index: polly/trunk/test/Isl/CodeGen/phi_condition_modeling_1.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/phi_condition_modeling_1.ll
+++ polly/trunk/test/Isl/CodeGen/phi_condition_modeling_1.ll
@@ -0,0 +1,59 @@
+; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -polly-codegen < %s | FileCheck %s
+;
+;    void f(int *A, int c, int N) {
+;      int tmp;
+;      for (int i = 0; i < N; i++) {
+;        if (i > c)
+;          tmp = 3;
+;        else
+;          tmp = 5;
+;        A[i] = tmp;
+;      }
+;    }
+;
+; CHECK-LABEL: bb:
+; CHECK:       %tmp.0.phiops = alloca i32
+; CHECK-LABEL: polly.stmt.bb8:
+; CHECK:       %tmp.0.phiops.reload = load i32, i32* %tmp.0.phiops
+; CHECK:       store i32 %tmp.0.phiops.reload, i32*
+; CHECK-LABEL: polly.stmt.bb6:
+; CHECK:       store i32 3, i32* %tmp.0.phiops
+; CHECK-LABEL: polly.stmt.bb7:
+; CHECK:       store i32 5, i32* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32 %c, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  %tmp1 = sext i32 %c to i64
+  br label %bb2
+
+bb2:                                              ; preds = %bb10, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb10 ], [ 0, %bb ]
+  %tmp3 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp3, label %bb4, label %bb11
+
+bb4:                                              ; preds = %bb2
+  %tmp5 = icmp sgt i64 %indvars.iv, %tmp1
+  br i1 %tmp5, label %bb6, label %bb7
+
+bb6:                                              ; preds = %bb4
+  br label %bb8
+
+bb7:                                              ; preds = %bb4
+  br label %bb8
+
+bb8:                                              ; preds = %bb7, %bb6
+  %tmp.0 = phi i32 [ 3, %bb6 ], [ 5, %bb7 ]
+  %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %tmp.0, i32* %tmp9, align 4
+  br label %bb10
+
+bb10:                                             ; preds = %bb8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb2
+
+bb11:                                             ; preds = %bb2
+  ret void
+}
Index: polly/trunk/test/Isl/CodeGen/phi_condition_modeling_2.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/phi_condition_modeling_2.ll
+++ polly/trunk/test/Isl/CodeGen/phi_condition_modeling_2.ll
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array  -polly-codegen < %s | FileCheck %s
+;
+;    void f(int *A, int c, int N) {
+;      int tmp;
+;      for (int i = 0; i < N; i++) {
+;        if (i > c)
+;          tmp = 3;
+;        else
+;          tmp = 5;
+;        A[i] = tmp;
+;      }
+;    }
+;
+; CHECK-LABEL: bb:
+; CHECK-DAG:   %tmp.0.s2a = alloca i32
+; CHECK-DAG:   %tmp.0.phiops = alloca i32
+; CHECK-LABEL: polly.stmt.bb8:
+; CHECK:       %tmp.0.phiops.reload = load i32, i32* %tmp.0.phiops
+; CHECK:       store i32 %tmp.0.phiops.reload, i32* %tmp.0.s2a
+; CHECK-LABEL: polly.stmt.bb8b:
+; CHECK:       %tmp.0.s2a.reload = load i32, i32* %tmp.0.s2a
+; CHECK:       store i32 %tmp.0.s2a.reload,
+; CHECK-LABEL: polly.stmt.bb6:
+; CHECK:       store i32 3, i32* %tmp.0.phiops
+; CHECK-LABEL: polly.stmt.bb7:
+; CHECK:       store i32 5, i32* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32 %c, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  %tmp1 = sext i32 %c to i64
+  br label %bb2
+
+bb2:                                              ; preds = %bb10, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb10 ], [ 0, %bb ]
+  %tmp3 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp3, label %bb4, label %bb11
+
+bb4:                                              ; preds = %bb2
+  %tmp5 = icmp sgt i64 %indvars.iv, %tmp1
+  br i1 %tmp5, label %bb6, label %bb7
+
+bb6:                                              ; preds = %bb4
+  br label %bb8
+
+bb7:                                              ; preds = %bb4
+  br label %bb8
+
+bb8:                                              ; preds = %bb7, %bb6
+  %tmp.0 = phi i32 [ 3, %bb6 ], [ 5, %bb7 ]
+  br label %bb8b
+
+bb8b:
+  %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %tmp.0, i32* %tmp9, align 4
+  br label %bb10
+
+bb10:                                             ; preds = %bb8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb2
+
+bb11:                                             ; preds = %bb2
+  ret void
+}
Index: polly/trunk/test/Isl/CodeGen/phi_conditional_simple_1.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/phi_conditional_simple_1.ll
+++ polly/trunk/test/Isl/CodeGen/phi_conditional_simple_1.ll
@@ -0,0 +1,72 @@
+; RUN: opt %loadPolly -analyze -polly-ast -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -polly-codegen < %s | FileCheck %s
+;
+;    void jd(int *A, int c) {
+;      for (int i = 0; i < 1024; i++) {
+;        if (c)
+;          A[i] = 1;
+;        else
+;          A[i] = 2;
+;      }
+;    }
+
+; AST:    for (int c0 = 0; c0 <= 1023; c0 += 1) {
+; AST:      if (c <= -1) {
+; AST:        Stmt_if_then(c0);
+; AST:      } else if (c >= 1) {
+; AST:        Stmt_if_then(c0);
+; AST:      } else
+; AST:        Stmt_if_else(c0);
+; AST:      Stmt_if_end(c0);
+; AST:    }
+;
+; CHECK-LABEL:  entry:
+; CHECK-NEXT:     %phi.phiops = alloca i32
+; CHECK-LABEL:  polly.stmt.if.end:
+; CHECK-NEXT:     %phi.phiops.reload = load i32, i32* %phi.phiops
+; CHECK-NEXT:     %scevgep
+; CHECK-NEXT:     store i32 %phi.phiops.reload, i32*
+; CHECK-LABEL:  polly.stmt.if.then:
+; CHECK-NEXT:     store i32 1, i32* %phi.phiops
+; CHECK-NEXT:     br label %polly.merge{{[.]?}}
+; CHECK-LABEL:  polly.stmt.if.then{{.}}:
+; CHECK-NEXT:     store i32 1, i32* %phi.phiops
+; CHECK-NEXT:     br label %polly.merge{{[.]?}}
+; CHECK-LABEL:  polly.stmt.if.else:
+; CHECK-NEXT:     store i32 2, i32* %phi.phiops
+; CHECK-NEXT:     br label %polly.merge{{[.]?}}
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @jd(i32* %A, i32 %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:
+  %tobool = icmp eq i32 %c, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  br label %if.end
+
+if.else:
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [ 1, %if.then], [ 2, %if.else ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %phi, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
Index: polly/trunk/test/Isl/CodeGen/phi_loop_carried_float.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/phi_loop_carried_float.ll
+++ polly/trunk/test/Isl/CodeGen/phi_loop_carried_float.ll
@@ -0,0 +1,67 @@
+; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array  -polly-codegen < %s | FileCheck %s
+;
+;    float f(float *A, int N) {
+;      float tmp = 0;
+;      for (int i = 0; i < N; i++)
+;        tmp += A[i];
+;    }
+;
+; CHECK:      bb:
+; CHECK-NOT:    %tmp7{{[.*]}} = alloca float
+; CHECK-DAG:    %tmp.0.s2a = alloca float
+; CHECK-NOT:    %tmp7{{[.*]}} = alloca float
+; CHECK-DAG:    %tmp.0.phiops = alloca float
+; CHECK-NOT:    %tmp7{{[.*]}} = alloca float
+;
+; CHECK:      polly.merge_new_and_old:
+; CHECK-NEXT:  ret
+;
+; CHECK:      polly.start:
+; CHECK-NEXT:   store float 0.000000e+00, float* %tmp.0.phiops
+
+; CHECK:      polly.merge:
+; CHECK-NEXT:   br label %polly.merge_new_and_old
+
+; CHECK:      polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT:   %tmp.0.phiops.reload[[R1:[0-9]*]] = load float, float* %tmp.0.phiops
+; CHECK:        store float %tmp.0.phiops.reload[[R1]], float* %tmp.0.s2a
+
+; CHECK:      polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT:   %tmp.0.phiops.reload[[R2:[0-9]*]] = load float, float* %tmp.0.phiops
+; CHECK:        store float %tmp.0.phiops.reload[[R2]], float* %tmp.0.s2a
+
+; CHECK: polly.stmt.bb4:                                   ; preds = %polly.then3
+; CHECK:   %tmp[[R5:[0-9]*]]_p_scalar_ = load float, float* %scevgep, align 4, !alias.scope !0, !noalias !2
+; CHECK:   %tmp.0.s2a.reload[[R3:[0-9]*]] = load float, float* %tmp.0.s2a
+; CHECK:   %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.s2a.reload[[R3]], %tmp[[R5]]_p_scalar_
+; CHECK:   store float %p_tmp[[R4]], float* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ]
+  %tmp2 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %tmp6 = load float, float* %tmp5, align 4
+  %tmp7 = fadd float %tmp.0, %tmp6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  br label %exit
+
+exit:
+  ret void
+}
Index: polly/trunk/test/Isl/CodeGen/phi_loop_carried_float_escape.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/phi_loop_carried_float_escape.ll
+++ polly/trunk/test/Isl/CodeGen/phi_loop_carried_float_escape.ll
@@ -0,0 +1,63 @@
+; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array  -polly-codegen < %s | FileCheck %s
+;
+;    float f(float *A, int N) {
+;      float tmp = 0;
+;      for (int i = 0; i < N; i++)
+;        tmp += A[i];
+;      return tmp;
+;    }
+;
+; CHECK:      polly.merge_new_and_old:
+; CHECK-NEXT:   %tmp.0.merge = phi float [ %tmp.0.final_reload, %polly.merge ], [ %tmp.0, %bb8 ]
+; CHECK-NEXT:   ret float %tmp.0.merge
+;
+; CHECK:      polly.start:
+; CHECK-NEXT:   store float 0.000000e+00, float* %tmp.0.phiops
+
+; CHECK:      polly.merge:
+; CHECK-NEXT:   %tmp.0.final_reload = load float, float* %tmp.0.s2a
+; CHECK-NEXT:   br label %polly.merge_new_and_old
+
+; CHECK:      polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT:   %tmp.0.phiops.reload[[R1:[0-9]*]] = load float, float* %tmp.0.phiops
+; CHECK-:       store float %tmp.0.phiops.reload[[R1]], float* %tmp.0.s2a
+
+; CHECK:      polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT:   %tmp.0.phiops.reload[[R2:[0-9]*]] = load float, float* %tmp.0.phiops
+; CHECK:        store float %tmp.0.phiops.reload[[R2]], float* %tmp.0.s2a
+
+; CHECK: polly.stmt.bb4:                                   ; preds = %polly.then3
+; CHECK:   %tmp[[R5:[0-9]*]]_p_scalar_ = load float, float* %scevgep, align 4, !alias.scope !0, !noalias !2
+; CHECK:   %tmp.0.s2a.reload[[R3:[0-9]*]] = load float, float* %tmp.0.s2a
+; CHECK:   %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.s2a.reload[[R3]], %tmp[[R5]]_p_scalar_
+; CHECK:   store float %p_tmp[[R4]], float* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define float @f(float* %A, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ]
+  %tmp2 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %tmp6 = load float, float* %tmp5, align 4
+  %tmp7 = fadd float %tmp.0, %tmp6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  br label %exit
+
+exit:
+  ret float %tmp.0
+}
Index: polly/trunk/test/Isl/CodeGen/phi_scalar_simple_1.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/phi_scalar_simple_1.ll
+++ polly/trunk/test/Isl/CodeGen/phi_scalar_simple_1.ll
@@ -0,0 +1,93 @@
+; RUN: opt %loadPolly -S -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-no-early-exit -polly-codegen < %s | FileCheck %s
+;
+;    int jd(int *restrict A, int x, int N) {
+;      for (int i = 1; i < N; i++)
+;        for (int j = 3; j < N; j++)
+;          x += A[i];
+;      return x;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @jd(i32* noalias %A, i32 %x, i32 %N) {
+entry:
+; CHECK-LABEL: entry:
+; CHECK-DAG:     %x.addr.1.lcssa.s2a = alloca i32
+; CHECK-DAG:     %x.addr.1.lcssa.phiops = alloca i32
+; CHECK-DAG:     %x.addr.1.s2a = alloca i32
+; CHECK-DAG:     %x.addr.1.phiops = alloca i32
+; CHECK-DAG:     %x.addr.0.s2a = alloca i32
+; CHECK-DAG:     %x.addr.0.phiops = alloca i32
+  %tmp = sext i32 %N to i64
+  br label %for.cond
+
+; CHECK-LABEL: polly.merge_new_and_old:
+; CHECK:         %x.addr.0.merge = phi i32 [ %x.addr.0.final_reload, %polly.merge ], [ %x.addr.0, %for.cond ]
+; CHECK:         ret i32 %x.addr.0.merge
+
+; CHECK-LABEL: polly.start:
+; CHECK-NEXT:    store i32 %x, i32* %x.addr.0.phiops
+
+; CHECK-LABEL: polly.merge:
+; CHECK:         %x.addr.0.final_reload = load i32, i32* %x.addr.0.s2a
+
+for.cond:                                         ; preds = %for.inc4, %entry
+; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}:
+; CHECK:         %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops
+; CHECK:         store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc4 ], [ 1, %entry ]
+  %x.addr.0 = phi i32 [ %x, %entry ], [ %x.addr.1.lcssa, %for.inc4 ]
+  %cmp = icmp slt i64 %indvars.iv, %tmp
+  br i1 %cmp, label %for.body, label %for.end6
+
+; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}:
+; CHECK:         %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops
+; CHECK:         store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a
+
+for.body:                                         ; preds = %for.cond
+; CHECK-LABEL: polly.stmt.for.body:
+; CHECK:         %x.addr.0.s2a.reload[[R2:[0-9]*]] = load i32, i32* %x.addr.0.s2a
+; CHECK:         store i32 %x.addr.0.s2a.reload[[R2]], i32* %x.addr.1.phiops
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+; CHECK-LABEL: polly.stmt.for.end:
+; CHECK-NEXT:    %x.addr.1.lcssa.phiops.reload = load i32, i32* %x.addr.1.lcssa.phiops
+; CHECK-NEXT:    store i32 %x.addr.1.lcssa.phiops.reload, i32* %x.addr.1.lcssa.s2a[[R4:[0-9]*]]
+  %x.addr.1.lcssa = phi i32 [ %x.addr.1, %for.cond1 ]
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+; CHECK-LABEL: polly.stmt.for.inc4:
+; CHECK:         %x.addr.1.lcssa.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.lcssa.s2a[[R4]]
+; CHECK:         store i32 %x.addr.1.lcssa.s2a.reload[[R5]], i32* %x.addr.0.phiops
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+; CHECK-LABEL: polly.stmt.for.cond1:
+; CHECK:         %x.addr.1.phiops.reload = load i32, i32* %x.addr.1.phiops
+; CHECK:         store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.s2a[[R6:[0-9]*]]
+; CHECK:         store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.lcssa.phiops
+  %x.addr.1 = phi i32 [ %x.addr.0, %for.body ], [ %add, %for.inc ]
+  %j.0 = phi i32 [ 3, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, %N
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+; CHECK-LABEL: polly.stmt.for.inc:
+; CHECK:         %x.addr.1.s2a.reload[[R3:[0-9]*]] = load i32, i32* %x.addr.1.s2a
+; CHECK:         %p_add = add nsw i32 %x.addr.1.s2a.reload[[R3]], %tmp1_p_scalar_
+; CHECK:         store i32 %p_add, i32* %x.addr.1.phiops
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %x.addr.1, %tmp1
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end6:                                         ; preds = %for.cond
+  ret i32 %x.addr.0
+}
Index: polly/trunk/test/Isl/CodeGen/phi_scalar_simple_2.ll
===================================================================
--- polly/trunk/test/Isl/CodeGen/phi_scalar_simple_2.ll
+++ polly/trunk/test/Isl/CodeGen/phi_scalar_simple_2.ll
@@ -0,0 +1,108 @@
+; RUN: opt %loadPolly -S -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-no-early-exit -polly-codegen < %s | FileCheck %s
+;
+;    int jd(int *restrict A, int x, int N, int c) {
+;      for (int i = 0; i < N; i++)
+;        for (int j = 0; j < N; j++)
+;          if (i < c)
+;            x += A[i];
+;      return x;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @jd(i32* noalias %A, i32 %x, i32 %N, i32 %c) {
+entry:
+; CHECK-LABEL: entry:
+; CHECK-DAG:     %x.addr.2.s2a = alloca i32
+; CHECK-DAG:     %x.addr.2.phiops = alloca i32
+; CHECK-DAG:     %x.addr.1.s2a = alloca i32
+; CHECK-DAG:     %x.addr.1.phiops = alloca i32
+; CHECK-DAG:     %x.addr.0.s2a = alloca i32
+; CHECK-DAG:     %x.addr.0.phiops = alloca i32
+  %tmp = sext i32 %N to i64
+  %tmp1 = sext i32 %c to i64
+  br label %for.cond
+
+; CHECK-LABEL: polly.merge_new_and_old:
+; CHECK:         %x.addr.0.merge = phi i32 [ %x.addr.0.final_reload, %polly.merge ], [ %x.addr.0, %for.cond ]
+; CHECK:         ret i32 %x.addr.0.merge
+
+; CHECK-LABEL: polly.start:
+; CHECK-NEXT:    store i32 %x, i32* %x.addr.0.phiops
+
+; CHECK-LABEL: polly.merge:
+; CHECK:         %x.addr.0.final_reload = load i32, i32* %x.addr.0.s2a
+
+for.cond:                                         ; preds = %for.inc5, %entry
+; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}:
+; CHECK:         %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops
+; CHECK:         store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc5 ], [ 0, %entry ]
+  %x.addr.0 = phi i32 [ %x, %entry ], [ %x.addr.1, %for.inc5 ]
+  %cmp = icmp slt i64 %indvars.iv, %tmp
+  br i1 %cmp, label %for.body, label %for.end7
+
+; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}:
+; CHECK:         %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops
+; CHECK:         store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a
+
+for.body:                                         ; preds = %for.cond
+; CHECK-LABEL: polly.stmt.for.body:
+; CHECK:         %x.addr.0.s2a.reload[[R2:[0-9]*]] = load i32, i32* %x.addr.0.s2a
+; CHECK:         store i32 %x.addr.0.s2a.reload[[R2]], i32* %x.addr.1.phiops
+  br label %for.cond1
+
+for.inc5:                                         ; preds = %for.end
+; CHECK-LABEL: polly.stmt.for.inc5:
+; CHECK:         %x.addr.1.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.s2a
+; CHECK:         store i32 %x.addr.1.s2a.reload[[R5]], i32* %x.addr.0.phiops
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+; CHECK-LABEL: polly.stmt.for.cond1:
+; CHECK:         %x.addr.1.phiops.reload = load i32, i32* %x.addr.1.phiops
+; CHECK:         store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.s2a
+  %x.addr.1 = phi i32 [ %x.addr.0, %for.body ], [ %x.addr.2, %for.inc ]
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, %N
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+; CHECK-LABEL: polly.stmt.for.body3:
+; CHECK:  %x.addr.1.s2a.reload = load i32, i32* %x.addr.1.s2a
+; CHECK:  store i32 %x.addr.1.s2a.reload, i32* %x.addr.2.phiops
+  %cmp4 = icmp slt i64 %indvars.iv, %tmp1
+  br i1 %cmp4, label %if.then, label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body3
+; CHECK-LABEL: polly.stmt.if.end:
+; CHECK:         %x.addr.2.phiops.reload = load i32, i32* %x.addr.2.phiops
+; CHECK:         store i32 %x.addr.2.phiops.reload, i32* %x.addr.2.s2a
+  %x.addr.2 = phi i32 [ %add, %if.then ], [ %x.addr.1, %for.body3 ]
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+; CHECK-LABEL: polly.stmt.for.inc:
+; CHECK:         %x.addr.2.s2a.reload[[R3:[0-9]*]] = load i32, i32* %x.addr.2.s2a
+; CHECK:         store i32 %x.addr.2.s2a.reload[[R3]], i32* %x.addr.1.phiops
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+if.then:                                          ; preds = %for.body3
+; CHECK-LABEL: polly.stmt.if.then:
+; CHECK:         %x.addr.1.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.s2a
+; CHECK:         %p_add = add nsw i32 %x.addr.1.s2a.reload[[R5]], %tmp2_p_scalar_
+; CHECK:         store i32 %p_add, i32* %x.addr.2.phiops
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp2 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %x.addr.1, %tmp2
+  br label %if.end
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc5
+
+for.end7:                                         ; preds = %for.cond
+  ret i32 %x.addr.0
+}
+