Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -1325,6 +1325,51 @@
   return false;
 }
 
+static bool checkForStreamingAccesses(const std::vector<Reference> &References) {
+
+  for(auto it = References.begin(); it != References.end(); it++) {
+    if(it->Type != Reference::CHUNK && it->Type != Reference::SINGLE_ELEMENT)
+      return false;
+  }
+ 
+  assert(References.size() != 1);
+ 
+  for(std::size_t i = 1; i < References.size(); ++i) {
+    if(!std::equal(References[0].LoopOrder.begin(),References[0].LoopOrder.end(),
+             References[i].LoopOrder.begin()))
+      return false;
+  }
+  
+  return true;
+
+}
+
+/// @param Node. Check if the node contains a streaming kernel.
+/// @param SK. The Skeleton array containing all the references for all the Stmts.
+/// 1. The skeleton should look like: CHUNK -> CHUNK or ELEMENT -> ELEMENT.
+/// 2. The partial index order should be the same.
+/// 3. TODO:It should have a stride in the innermost dimension.
+/// 4. TODO: Dependencies?
+
+static bool isStreaming(isl::schedule_node Node, const std::vector<Skeleton> &SK) {
+  isl::union_map PartialSchedule = isl::manage(
+    isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
+  if(PartialSchedule.n_map() != 1)
+    return false;
+  isl::map NewPartialSchedule = isl::map::from_union_map(PartialSchedule);
+  isl::id InputDimsId = NewPartialSchedule.get_tuple_id(isl::dim::in);
+  ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+  bool Streaming = false;
+  std::vector<Skeleton>::const_iterator it = SK.begin();
+  for(; it != SK.end(); it++) {
+    if(strcmp(it->StmtBaseName, Stmt->getBaseName()) != 0)
+      continue;
+    if(checkForStreamingAccesses(it->References))
+      Streaming = true;
+  } 
+  return Streaming;
+}
+
 __isl_give isl_schedule_node *
 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
                                     void *User) {
@@ -1334,6 +1379,12 @@
   const OptimizerAdditionalInfoTy *OAI =
       static_cast<const OptimizerAdditionalInfoTy *>(User);
 
+  /// avoid tiling.
+  if(isStreaming(isl::manage_copy(Node), OAI->Skeletons)) {
+    DEBUG(dbgs() << "Streaming detected\n");
+    return Node;
+  }
+
   MatMulInfoTy MMI;
   if (PMBasedOpts && User &&
       isMatrMultPattern(isl::manage_copy(Node), OAI->D, MMI)) {
@@ -1467,6 +1518,431 @@
       &Version);
 }
 
+/// Get the domain based on the bounds.
+
+/// @param MemAcc. The memory access for which we want to 
+/// compute the domain w.r.t. loop bounds.
+/// @param Dom. The resulting domain.
+
+static void getDomain(MemoryAccess *MemAcc, isl::set &Dom) {
+
+  auto ArrayInfo = MemAcc->getLatestScopArrayInfo();
+  unsigned NumDims = ArrayInfo->getNumberOfDimensions();
+
+  if (NumDims == 0)
+    Dom = isl::set::universe(ArrayInfo->getSpace());
+
+  Scop *S = ArrayInfo->getScop();
+  isl::union_map Accesses = S->getAccesses();
+  isl::union_map sched = ArrayInfo->getScop()->getSchedule();
+  
+  Accesses = Accesses.apply_domain(sched);
+
+  isl::union_set AccessRange = Accesses.range();
+  AccessRange = AccessRange.coalesce();
+  AccessRange = AccessRange.detect_equalities();
+  AccessRange = AccessRange.coalesce();
+
+  if (AccessRange.is_empty())
+    Dom = isl::set::empty(ArrayInfo->getSpace());
+
+  isl::set AccessSet = AccessRange.extract_set(ArrayInfo->getSpace());
+  isl::local_space LS = isl::local_space(ArrayInfo->getSpace());
+  Dom = isl::set::universe(ArrayInfo->getSpace());
+
+  for (unsigned i = 0; i < NumDims; ++i) {
+    isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, i);
+    isl::pw_aff OuterMin = AccessSet.dim_min(i);
+    isl::pw_aff OuterMax = AccessSet.dim_max(i);
+    OuterMin = OuterMin.add_dims(isl::dim::in, Val.dim(isl::dim::in));
+    OuterMax = OuterMax.add_dims(isl::dim::in, Val.dim(isl::dim::in));
+    OuterMin = OuterMin.set_tuple_id(isl::dim::in, ArrayInfo->getBasePtrId());
+    OuterMax = OuterMax.set_tuple_id(isl::dim::in, ArrayInfo->getBasePtrId());
+    Dom = Dom.intersect(OuterMin.le_set(Val));
+    Dom = Dom.intersect(OuterMax.ge_set(Val));
+  }
+}
+
+///NOTE: At the moment I am not able to quantify the step.
+/// so we push a '1' if we detect a step X along one loop dimension, 0 otherwise.
+
+/// get the step.
+
+/// @param MemAcc. The memory access for which we want to compute the step.
+/// @param Sched. The Scop schedule.
+/// @param HasStride. The vector to be filled. 1 if we detect a stride (step)
+/// along a dimension, 0 otherwise. The dimensions (loop indexes) are ordered from outermost
+/// to innermost.
+
+static void getStep(MemoryAccess *MemAcc, isl::map &Sched,
+                    /*unsigned LoopLevel,*/ std::vector<bool> &HasStride) {
+
+  unsigned DimNumOut = Sched.dim(isl::dim::out);
+  unsigned StrideDetected = false;
+  for (unsigned uu = 0; uu < DimNumOut; uu++) {
+    StrideDetected = false;
+    // the loop for which you want to compute the step
+    // should be the innermost one.
+    isl::map NewSched =
+        permuteDimensions(Sched, isl::dim::out, uu, DimNumOut - 1);
+    isl::set Deltas = MemAcc->getStride(NewSched);
+    //DEBUG(dbgs() << "DELTAS@ " << Deltas << "\n");
+    isl::pw_multi_aff MultAff = isl::pw_multi_aff::from_set(Deltas);
+    if (MultAff.n_piece() == 0)
+      continue;
+    if (MultAff.n_piece() > 1)
+      continue;
+    //DEBUG(dbgs() << MultAff << "\n");
+    for (unsigned u = 0; u < MultAff.dim(isl::dim::out); ++u) {
+      isl::pw_aff PwAff = MultAff.get_pw_aff(u);
+      PwAff.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat {
+        isl::val Val = Aff.get_constant_val();
+        if (!Val.is_zero()) {
+          //DEBUG(dbgs() << "Stride on dimension :" << uu << "\n");
+          HasStride.push_back(true);
+          StrideDetected = true;
+        }
+        return isl::stat::ok;
+      });
+    }
+    if (!StrideDetected)
+      HasStride.push_back(false);
+  }
+ }
+
+/// Find the element accessed by each memory access
+/// for every iteration of the outermost loop.
+/// In addition, the function 
+/// records the relative indexes order.
+/// Three cases:
+/// 1. The access is dependent on the outer loops
+/// 2. The access is dependent on the inner loops
+/// 3. The access is not dependent on any loops
+
+/// @param MemAcc. The memory access for which we want to compute
+/// the number of element(s) accessed.
+/// @param Sched. The Scop schedule.
+/// @param ElementAccessed. The array to be filled with the number
+/// of elements accessed every iteration for each loop index.
+/// @param LoopOrder. The array to be filled with the relative loop
+/// indexes order. 
+
+static void getElementAccessed(MemoryAccess *MemAcc, isl::map &Sched,
+                               isl::set &AccessSet,
+                               std::vector<isl::val> &ElementAccessed,
+                               std::vector<unsigned> &LoopOrder) {
+
+  isl::map AccMap = MemAcc->getLatestAccessRelation();
+  AccMap = AccMap.intersect_domain(MemAcc->getStatement()->getDomain());
+  isl::pw_multi_aff MultiAff = isl::pw_multi_aff::from_map(AccMap);
+  unsigned IndexAccessMap = 0;
+
+  /// iterate across the dimensions of the accessed memory location.
+  for (unsigned u = 0; u < AccMap.dim(isl::dim::out); ++u) {
+    isl::pw_aff PwAff = MultiAff.get_pw_aff(u);
+    PwAff.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat {
+      bool Relate = false;
+      /// check if a dimension relates to a loop index.
+      for (unsigned LoopDim = 0; LoopDim < AccMap.dim(isl::dim::in);
+           ++LoopDim) {
+        isl::val V = Aff.get_coefficient_val(isl::dim::in, LoopDim);
+        if (!V.is_zero()) {
+          isl::pw_aff OuterMin, OuterMax;
+          if (LoopDim == 0) {
+            // DEBUG(dbgs() << "dimension relates to outer loops" << "\n");
+            isl::local_space Ls =
+                isl::local_space(AccessSet.get_space().params());
+            isl::aff One = isl::aff(Ls);
+            One = One.add_constant_si(1);
+            isl::val Val = One.get_constant_val();
+            ElementAccessed.push_back(Val);
+          } else {
+            // DEBUG(dbgs() << "dimension relates to inner loops" << "\n");
+            OuterMin = AccessSet.dim_min(IndexAccessMap);
+            OuterMax = AccessSet.dim_max(IndexAccessMap);
+            OuterMax = OuterMax.sub(OuterMin);
+        
+            assert(OuterMax.n_piece() == 1);
+            OuterMax.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat {
+              isl::val Val = Aff.get_constant_val();
+              ElementAccessed.push_back(Val);
+              return isl::stat::ok;
+            });
+          }
+          LoopOrder.push_back(LoopDim);
+          Relate = true;
+          IndexAccessMap++;
+        }
+        // The dimension does not relate.
+        else if (V.is_zero()) {
+          if (!Relate && LoopDim == AccMap.dim(isl::dim::in) - 1) {
+            // DEBUG(dbgs() << "found dimension that does not relate to any loop
+            // index\n");
+            isl::local_space Ls =
+                isl::local_space(AccessSet.get_space().params());
+            isl::aff One = isl::aff(Ls);
+            One = One.add_constant_si(1);
+            isl::val Val = One.get_constant_val();
+            ElementAccessed.push_back(Val);
+          }
+        }
+      }
+      return isl::stat::ok;
+    });
+  }
+}
+
+
+/// check if only a single element is accessed.
+
+static bool accessSingleElement(std::vector<isl::val> &E) {
+  bool isElement = true;
+  std::vector<isl::val>::iterator it = E.begin();
+  while (it != E.end()) {
+    if (!it->is_one())
+      isElement = false;
+    it++;
+  }
+  return isElement;
+}
+
+/// extract the species for the given array of memory references.
+/// @param the array of memory references to be classified.
+
+static void extractSpecies(std::vector<Reference> &R /*, int LoopLevel*/) {
+  assert(!R.empty() && "Structure is empty");
+  std::vector<Reference>::iterator it = R.begin();
+  while (it != R.end()) {
+    if (!it->HasStride[0] && it->AccessType == 1) {
+      it->Type = Reference::FULL;
+      // DEBUG(dbgs() << "[ " << it->Name << " ]" << " FULL " << "\n");
+    } else if (!it->HasStride[0] && it->AccessType == 2) {
+      // DEBUG(dbgs() << it->Step[0].is_zero() << "\n");
+      it->Type = Reference::SHARED;
+      // DEBUG(dbgs() << "[ " << it->Name << " ]" << " SHARED " << "\n");
+    }
+
+    else if (accessSingleElement(it->ElementAccessed)) {
+      it->Type = Reference::SINGLE_ELEMENT;
+      // DEBUG(dbgs() << "[ " << it->Name << " ]" << " ELEMENT " << "\n");
+    }
+
+    else {
+      it->Type = Reference::CHUNK;
+      // DEBUG(dbgs() << "[ " << it->Name << " ]" << " CHUNK " << "\n");
+    }
+    it++;
+  }
+}
+
+/// helper function. To be removed.
+
+static std::string getTypeAsString(int Index) {
+  switch (Index) {
+  case 0:
+    return "SINGLE_ELEMENT";
+  case 1:
+    return "CHUNK";
+  case 2:
+    return "NEIGHBORHOOD";
+  case 3:
+    return "FULL";
+  case 4:
+    return "SHARED";
+  default:
+    return "ND";
+  }
+}
+
+/// helper function. to be removed.
+
+static void printStructure(std::vector<Reference> &R, const char *StmtBaseName) {
+  assert(!R.empty() && "Structure is empty");
+  // DEBUG(dbgs() << R.size() << "\n");
+  DEBUG(dbgs() << "Stmt Name := " << StmtBaseName << "\n");
+  std::vector<Reference>::iterator it = R.begin();
+  while (it != R.end()) {
+    if (it->AccessType == 2)
+      DEBUG(dbgs() << "[" << it->Name << " " << getTypeAsString(it->Type) << "]"
+                   << " -> ");
+    it++;
+  }
+  it = R.begin();
+  while (it != R.end()) {
+    if (it->AccessType != 2)
+      DEBUG(dbgs() << "[" << it->Name << " " << getTypeAsString(it->Type)
+                   << "]");
+    it++;
+  }
+  DEBUG(dbgs() << "\n");
+  while (!R.empty()) {
+    Reference RR = R.back();
+    DEBUG(dbgs() << "AccessType@: " << RR.AccessType << "\n");
+    DEBUG(dbgs() << "Name@: " << RR.Name << "\n");
+    DEBUG(dbgs() << "Domain@: " << RR.Domain << "\n");
+    DEBUG(dbgs() << "ElementAccessed for each outer loop iteration :=");
+    while (!RR.ElementAccessed.empty()) {
+      isl::val Val = RR.ElementAccessed.front();
+      DEBUG(dbgs() << "[" << Val.to_str() << "]");
+      RR.ElementAccessed.erase(RR.ElementAccessed.begin());
+    }
+    DEBUG(dbgs() << "\n");
+    DEBUG(dbgs() << "StrideOnDimension :=");
+    while (!RR.HasStride.empty()) {
+      bool Val = RR.HasStride.front();
+      DEBUG(dbgs() << "[" << Val << "]");
+      RR.HasStride.erase(RR.HasStride.begin());
+    }
+    DEBUG(dbgs() << "\n");
+    DEBUG(dbgs() << "Partial loop order :=");
+    while (!RR.LoopOrder.empty()) {
+      DEBUG(dbgs() << "[" << RR.LoopOrder.front() << "]");
+      RR.LoopOrder.erase(RR.LoopOrder.begin());
+    }
+    DEBUG(dbgs() << "\n");
+    R.pop_back();
+  }
+}
+
+
+/// Collect the 5-tuples.
+/// 5-tuples classification:
+/// 1. Reference name
+/// 2. Type of access (read or write)
+/// 3. Integer domain of the array reference w.r.t. the loop bounds
+/// 4. Number of element accessed per outer loop iteration.
+/// 5. Step
+
+/// @param Stmt. The Stmt to be classified.
+/// @param References. The vector that will be filled with
+/// all the references detected in the Stmt and the 5-tuples information.
+
+static void classifyReference(ScopStmt &Stmt,
+                              std::vector<Reference> &References) {
+
+  auto Accesses = getAccessesInOrder(Stmt);
+
+  /// Used to populate References.
+  int Index = 0;
+
+  for (auto *MemA = Accesses.begin(); MemA != Accesses.end(); MemA++) {
+
+    auto *MemAccessPtr = *MemA;
+
+    if (MemAccessPtr->isMemoryIntrinsic())
+      continue;
+    
+
+    /*
+        enum MemoryKind MK = MemAccessPtr->getOriginalKind();
+        if(MK == MemoryKind::Array) {
+          DEBUG(dbgs() << "Array kind\n");
+          DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n");
+        }
+        if(MK == MemoryKind::Value) {
+          DEBUG(dbgs() << "Value kind\n");
+          DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n");
+        }
+        if(MK == MemoryKind::PHI) {
+          DEBUG(dbgs() << "PHI kind\n");
+          DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n");
+          auto PHI = cast<PHINode>(MemAccessPtr->getAccessInstruction());
+          PHI->dump();
+        }
+        if(MK == MemoryKind::ExitPHI) {
+          DEBUG(dbgs() << "Exit kind\n");
+          DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n");
+        }
+    */
+
+    enum MemoryKind MK = MemAccessPtr->getOriginalKind();
+    if (MK != MemoryKind::Array)
+      continue;
+
+    /// 5-tuples classification.
+
+    References.push_back(Reference());
+
+    /// get access type. 
+    References[Index].AccessType = MemAccessPtr->getType();
+  
+    /// get reference name. 
+    References[Index].Name = MemAccessPtr->getLatestScopArrayInfo()->getName();
+
+    // unsigned LoopLevel = 0;
+    /// get domain w.r.t loop bounds
+    getDomain(MemAccessPtr, References[Index].Domain);
+    isl::map ScheduleMap = Stmt.getSchedule();
+    /// get elements accessed per outer loop iteration.
+    getElementAccessed(MemAccessPtr, ScheduleMap, /*LoopLevel,*/
+                       References[Index].Domain,
+                       References[Index].ElementAccessed,
+                       References[Index].LoopOrder);
+    /// get the step.
+    getStep(MemAccessPtr, ScheduleMap,
+            /*LoopLevel,*/ References[Index].HasStride);
+    Index++;
+  }
+}
+
+/// Check Stmt profitability.
+/// TODO: find a better profitability check.
+/// @param Stmt is a given statement. Returns true if the
+/// statement is deemed profitable to be classified, false otherwise.
+
+static bool isProfitableStmt(ScopStmt &Stmt) {
+
+  if (Stmt.isEmpty()) {
+    DEBUG(dbgs() << "Profitability: Empty Stmt"
+                 << "\n");
+    return false;
+  }
+
+  /// TODO: find a better profitability check.
+  /// maybe it is better to count the number of instructions (i.e. load/store).
+  if (getAccessesInOrder(Stmt).size() <= 2) {
+    DEBUG(dbgs() << "Profitability: Few Mem. references"
+                 << "\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "Profitability: Stmt is profitable"
+               << "\n");
+  return true;
+}
+
+/// Collect the statements for the species classification.
+/// Refer to the following paper for more details:
+/// Cedric Nugteren, Pieter Custers, and Henk Corporaal. 2013. 
+/// Algorithmic species: A classification of affine loop nests for parallel programming. 
+/// ACM Trans. Archit. Code Optim. 9, 4, Article 40 (January 2013), 25 pages. 
+/// DOI=http://dx.doi.org/10.1145/2400682.2400699
+/// or
+/// https://docs.google.com/document/d/1OOMg37pXI3bBkHMAtQsfszYwJ6-GvpO9e4zQ2qf44tM/edit
+ 
+/// @param Scop. The Scop to be analyzed.
+/// @param Skeleton vector. For each statement in the Scop
+/// we collect all the access references.
+/// The access references are used later on to extract the species
+/// of the statement based on the 5-tuples classification.
+
+static void collectStmtForClassification(Scop &S, std::vector<Skeleton> &SK) {
+  int Index = 0;
+  for (auto &Stmt : S) {
+    if (isProfitableStmt(Stmt)) {
+      SK.push_back(Skeleton());
+      SK[Index].StmtBaseName = Stmt.getBaseName(); 
+      classifyReference(Stmt, SK[Index].References);
+      extractSpecies(SK[Index].References);
+      //printStructure(SK[Index].References, SK[Index].StmtBaseName);
+      Index++;
+    } else {
+      DEBUG(dbgs() << "Skip Stmt"
+                   << "\n");
+    }
+  }
+}
+
 bool IslScheduleOptimizer::runOnScop(Scop &S) {
   // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
   if (S.isToBeSkipped())
@@ -1619,10 +2095,15 @@
 
   Function &F = S.getFunction();
   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-  const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)};
+  std::vector<Skeleton> Skeletons = std::vector<Skeleton>();
+  collectStmtForClassification(S, Skeletons);
+  const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D), Skeletons};
   auto NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
   walkScheduleTreeForStatistics(NewSchedule, 2);
 
+  for(std::size_t i = 0; i != Skeletons.size(); ++i)
+    printStructure(Skeletons[i].References, Skeletons[i].StmtBaseName);
+
   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule))
     return false;