Index: lib/Transform/MaximalStaticExpansion.cpp =================================================================== --- lib/Transform/MaximalStaticExpansion.cpp +++ lib/Transform/MaximalStaticExpansion.cpp @@ -76,12 +76,20 @@ /// Expand the read memory access. /// - /// @param The SCop in which the memory access appears in. - /// @param The memory access that need to be expanded. + /// @param S The SCop in which the memory access appears in. + /// @param MA The memory access that need to be expanded. /// @param Dependences The RAW dependences of the SCop. /// @param ExpandedSAI The expanded SAI created during write expansion. void expandRead(Scop &S, MemoryAccess *MA, isl::union_map &Dependences, ScopArrayInfo *ExpandedSAI); + + /// Filter the dependences to have only one related to current memory access. + /// + /// @param S The SCop in which the memory access appears in. + /// @param MapDependences The dependences to filter. + /// @param MA The memory access that need to be expanded. + isl::union_map filterDependences(Scop &S, isl::union_map MapDependences, + MemoryAccess *MA); }; } // namespace @@ -146,11 +154,60 @@ char MaximalStaticExpander::ID = 0; +isl::union_map +MaximalStaticExpander::filterDependences(Scop &S, isl::union_map Dependences, + MemoryAccess *MA) { + + isl::union_map MapDependences = isl::union_map::empty(S.getParamSpace()); + + Dependences.reverse().foreach_map( + [&MapDependences, &MA](isl::map Map) -> isl::stat { + + // Filter out Statement to Statement dependences. + if (!Map.can_curry()) + return isl::stat::ok; + + // Intersect with the relevant SAI. + auto SAI = MA->getLatestScopArrayInfo(); + auto Id = SAI->getBasePtrId(); + auto MARangeId = MA->getLatestAccessRelation().range().get_tuple_id(); + + auto TmpMap = Map.curry().range().wrapped_domain_map().curry(); + auto TmpMapDomainId = TmpMap.domain().get_tuple_id(); + + ScopArrayInfo *UserSAI = + static_cast(TmpMapDomainId.get_user()); + + if (SAI != UserSAI) + return isl::stat::ok; + + // Get the correct S1[] -> S2[] dependence. + auto NewMap = Map.factor_domain(); + auto NewMapDomainId = NewMap.domain().get_tuple_id(); + auto ReadDomainSet = MA->getAccessRelation().domain(); + auto ReadDomainId = ReadDomainSet.get_tuple_id(); + + if (ReadDomainId.keep() != NewMapDomainId.keep()) + return isl::stat::ok; + + // Add the corresponding map to MapDependences. + MapDependences = MapDependences.add_map(NewMap); + + return isl::stat::ok; + }); + + return MapDependences; +} + bool MaximalStaticExpander::isExpandable( const ScopArrayInfo *SAI, SmallPtrSetImpl &Writes, SmallPtrSetImpl &Reads, Scop &S, isl::union_map &Dependences) { + // Union_map needed to check if there are load after store in same statement. + auto Stores = isl::union_map::empty(S.getParamSpace()); + auto Loads = isl::union_map::empty(S.getParamSpace()); + int NumberWrites = 0; for (ScopStmt &Stmt : S) { for (MemoryAccess *MA : Stmt) { @@ -166,6 +223,23 @@ return false; } + // For now, we are not able to expand array where load come after store + // (to the same location) in a same statement. + auto AccRel = isl::union_map(MA->getAccessRelation()); + if (MA->isRead()) { + // Reject load after store to same location. + if (!Stores.is_disjoint(AccRel)) { + emitRemark(SAI->getName() + " has load after store to the same " + "element in same statement.", + MA->getAccessInstruction()); + return false; + } + + Loads = give(isl_union_map_union(Loads.take(), AccRel.take())); + } else { + Stores = give(isl_union_map_union(Stores.take(), AccRel.take())); + } + // For now, we are not able to expand MayWrite. if (MA->isMayWrite()) { emitRemark(SAI->getName() + " has a maywrite access.", @@ -191,15 +265,13 @@ auto StmtDomain = Stmt.getDomain(); // Get the domain of the future Read access. - auto ReadDomainSet = MA->getAccessRelation().domain(); auto ReadDomain = isl::union_set(ReadDomainSet); - auto CurrentReadWriteDependences = - Dependences.reverse().intersect_domain(ReadDomain); - auto DepsDomain = CurrentReadWriteDependences.domain(); - unsigned NumberElementMap = - isl_union_map_n_map(CurrentReadWriteDependences.get()); + // Get the dependences related to MA + auto MapDependences = filterDependences(S, Dependences, MA); + auto DepsDomain = MapDependences.domain(); + unsigned NumberElementMap = isl_union_map_n_map(MapDependences.get()); // If there are multiple maps in the Deps, we cannot handle this case // for now. @@ -246,17 +318,16 @@ auto WriteDomainSet = MA->getAccessRelation().domain(); auto WriteDomain = isl::union_set(WriteDomainSet); - auto CurrentReadWriteDependences = - Dependences.reverse().intersect_domain(WriteDomain); + // Get dependences related to MA + auto MapDependences = filterDependences(S, Dependences, MA); // If no dependences, no need to modify anything. - if (CurrentReadWriteDependences.is_empty()) { + if (MapDependences.is_empty()) return; - } - assert(isl_union_map_n_map(CurrentReadWriteDependences.get()) == 1 && + assert(isl_union_map_n_map(MapDependences.get()) == 1 && "There are more than one RAW dependencies in the union map."); - auto NewAccessMap = isl::map::from_union_map(CurrentReadWriteDependences); + auto NewAccessMap = isl::map::from_union_map(MapDependences); auto Id = ExpandedSAI->getBasePtrId(); @@ -348,7 +419,7 @@ // Get the RAW Dependences. auto &DI = getAnalysis(); - auto &D = DI.getDependences(Dependences::AL_Statement); + auto &D = DI.getDependences(Dependences::AL_Reference); auto Dependences = isl::give(D.getDependences(Dependences::TYPE_RAW)); SmallPtrSet CurrentSAI(S.arrays().begin(), Index: test/MaximalStaticExpansion/load_after_store_same_statement.ll =================================================================== --- /dev/null +++ test/MaximalStaticExpansion/load_after_store_same_statement.ll @@ -0,0 +1,95 @@ +; RUN: opt %loadPolly -polly-canonicalize -polly-mse -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-canonicalize -polly-mse -pass-remarks-analysis="polly-mse" -analyze < %s 2>&1| FileCheck %s --check-prefix=MSE +; +; Verify that the expansion of an array with load after store in a same statement is not done. +; +; Original source code : +; +; #define Ni 2000 +; #define Nj 3000 +; +; void mse(double A[Ni], double B[Nj], double C[Nj], double D[Nj]) { +; int i,j; +; for (i = 0; i < Ni; i++) { +; for (int j = 0; j MemRef_C_Stmt_for_body4_expanded[i0, i1] }; +; +; Check that B is not expanded +; +; CHECK-NOT: double MemRef_B_Stmt_for_body4_expanded[10000][10000]; // Element size 8 +; MSE: MemRef_B has load after store to the same element in same statement. +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @mse(double* %A, double* %B, double* %C, double* %D) { +entry: + %A.addr = alloca double*, align 8 + %B.addr = alloca double*, align 8 + %C.addr = alloca double*, align 8 + %D.addr = alloca double*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %j1 = alloca i32, align 4 + store double* %A, double** %A.addr, align 8 + store double* %B, double** %B.addr, align 8 + store double* %C, double** %C.addr, align 8 + store double* %D, double** %D.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond +for.cond: ; preds = %for.inc9, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end11 +for.body: ; preds = %for.cond + store i32 0, i32* %j1, align 4 + br label %for.cond2 +for.cond2: ; preds = %for.inc, %for.body + %1 = load i32, i32* %j1, align 4 + %cmp3 = icmp slt i32 %1, 10000 + br i1 %cmp3, label %for.body4, label %for.end +for.body4: ; preds = %for.cond2 + %2 = load i32, i32* %j1, align 4 + %conv = sitofp i32 %2 to double + %3 = load double*, double** %B.addr, align 8 + %4 = load i32, i32* %j1, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds double, double* %3, i64 %idxprom + store double %conv, double* %arrayidx, align 8 + %5 = load double*, double** %B.addr, align 8 + %6 = load i32, i32* %j1, align 4 + %idxprom5 = sext i32 %6 to i64 + %arrayidx6 = getelementptr inbounds double, double* %5, i64 %idxprom5 + %7 = load double, double* %arrayidx6, align 8 + %8 = load double*, double** %C.addr, align 8 + %9 = load i32, i32* %j1, align 4 + %idxprom7 = sext i32 %9 to i64 + %arrayidx8 = getelementptr inbounds double, double* %8, i64 %idxprom7 + store double %7, double* %arrayidx8, align 8 + br label %for.inc +for.inc: ; preds = %for.body4 + %10 = load i32, i32* %j1, align 4 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %j1, align 4 + br label %for.cond2 +for.end: ; preds = %for.cond2 + br label %for.inc9 +for.inc9: ; preds = %for.end + %11 = load i32, i32* %i, align 4 + %inc10 = add nsw i32 %11, 1 + store i32 %inc10, i32* %i, align 4 + br label %for.cond +for.end11: ; preds = %for.cond + ret void +} + Index: test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll =================================================================== --- /dev/null +++ test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll @@ -0,0 +1,135 @@ +; RUN: opt %loadPolly -polly-canonicalize -polly-mse -analyze < %s | FileCheck %s +; +; Verify that the accesses are correctly expanded +; +; Original source code : +; +; #define Ni 2000 +; #define Nj 3000 +; +; void mse(double A[Ni], double B[Nj], double C[Nj], double D[Nj]) { +; int i,j; +; for (j = 0; j < Ni; j++) { +; for (int i = 0; i MemRef_B_Stmt_for_body4_expanded[i0, i1] }; +; CHECK: new: { Stmt_for_body9[i0, i1] -> MemRef_D_Stmt_for_body9_expanded[i0, i1] }; +; CHECK: new: { Stmt_for_end15[i0] -> MemRef_B_Stmt_for_body4_expanded[i0, i0] }; +; CHECK: new: { Stmt_for_end15[i0] -> MemRef_A_Stmt_for_end15_expanded[i0] }; +; CHECK: new: { Stmt_for_end15[i0] -> MemRef_D_Stmt_for_body9_expanded[i0, i0] }; +; CHECK: new: { Stmt_for_end15[i0] -> MemRef_C_Stmt_for_end15_expanded[i0] }; +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @mse(double* %A, double* %B, double* %C, double* %D) { +entry: + %A.addr = alloca double*, align 8 + %B.addr = alloca double*, align 8 + %C.addr = alloca double*, align 8 + %D.addr = alloca double*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %i1 = alloca i32, align 4 + %i5 = alloca i32, align 4 + store double* %A, double** %A.addr, align 8 + store double* %B, double** %B.addr, align 8 + store double* %C, double** %C.addr, align 8 + store double* %D, double** %D.addr, align 8 + store i32 0, i32* %j, align 4 + br label %for.cond +for.cond: ; preds = %for.inc24, %entry + %0 = load i32, i32* %j, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end26 +for.body: ; preds = %for.cond + store i32 0, i32* %i1, align 4 + br label %for.cond2 +for.cond2: ; preds = %for.inc, %for.body + %1 = load i32, i32* %i1, align 4 + %cmp3 = icmp slt i32 %1, 10000 + br i1 %cmp3, label %for.body4, label %for.end +for.body4: ; preds = %for.cond2 + %2 = load i32, i32* %i1, align 4 + %conv = sitofp i32 %2 to double + %3 = load double*, double** %B.addr, align 8 + %4 = load i32, i32* %i1, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds double, double* %3, i64 %idxprom + store double %conv, double* %arrayidx, align 8 + br label %for.inc +for.inc: ; preds = %for.body4 + %5 = load i32, i32* %i1, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i1, align 4 + br label %for.cond2 +for.end: ; preds = %for.cond2 + store i32 0, i32* %i5, align 4 + br label %for.cond6 +for.cond6: ; preds = %for.inc13, %for.end + %6 = load i32, i32* %i5, align 4 + %cmp7 = icmp slt i32 %6, 10000 + br i1 %cmp7, label %for.body9, label %for.end15 +for.body9: ; preds = %for.cond6 + %7 = load i32, i32* %i5, align 4 + %conv10 = sitofp i32 %7 to double + %8 = load double*, double** %D.addr, align 8 + %9 = load i32, i32* %i5, align 4 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds double, double* %8, i64 %idxprom11 + store double %conv10, double* %arrayidx12, align 8 + br label %for.inc13 +for.inc13: ; preds = %for.body9 + %10 = load i32, i32* %i5, align 4 + %inc14 = add nsw i32 %10, 1 + store i32 %inc14, i32* %i5, align 4 + br label %for.cond6 +for.end15: ; preds = %for.cond6 + %11 = load double*, double** %B.addr, align 8 + %12 = load i32, i32* %j, align 4 + %idxprom16 = sext i32 %12 to i64 + %arrayidx17 = getelementptr inbounds double, double* %11, i64 %idxprom16 + %13 = load double, double* %arrayidx17, align 8 + %14 = load double*, double** %A.addr, align 8 + %15 = load i32, i32* %j, align 4 + %idxprom18 = sext i32 %15 to i64 + %arrayidx19 = getelementptr inbounds double, double* %14, i64 %idxprom18 + store double %13, double* %arrayidx19, align 8 + %16 = load double*, double** %D.addr, align 8 + %17 = load i32, i32* %j, align 4 + %idxprom20 = sext i32 %17 to i64 + %arrayidx21 = getelementptr inbounds double, double* %16, i64 %idxprom20 + %18 = load double, double* %arrayidx21, align 8 + %19 = load double*, double** %C.addr, align 8 + %20 = load i32, i32* %j, align 4 + %idxprom22 = sext i32 %20 to i64 + %arrayidx23 = getelementptr inbounds double, double* %19, i64 %idxprom22 + store double %18, double* %arrayidx23, align 8 + br label %for.inc24 +for.inc24: ; preds = %for.end15 + %21 = load i32, i32* %j, align 4 + %inc25 = add nsw i32 %21, 1 + store i32 %inc25, i32* %j, align 4 + br label %for.cond +for.end26: ; preds = %for.cond + ret void +} + Index: test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll =================================================================== --- /dev/null +++ test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll @@ -0,0 +1,117 @@ +; RUN: opt %loadPolly -polly-canonicalize -polly-mse -analyze < %s | FileCheck %s +; +; Verify that the accesses are correctly expanded +; +; Original source code : +; +; #define Ni 2000 +; #define Nj 3000 +; +; void mse(double A[Ni], double B[Nj], double C[Nj], double D[Nj]) { +; int i,j; +; for (j = 0; j < Nj; j++) { +; for (int i = 0; i MemRef_B_Stmt_for_body4_expanded[i0, i1] }; +; CHECK: new: { Stmt_for_body4[i0, i1] -> MemRef_D_Stmt_for_body4_expanded[i0, i1] }; +; CHECK: new: { Stmt_for_end[i0] -> MemRef_B_Stmt_for_body4_expanded[i0, i0] }; +; CHECK: new: { Stmt_for_end[i0] -> MemRef_A_Stmt_for_end_expanded[i0] }; +; CHECK: new: { Stmt_for_end[i0] -> MemRef_D_Stmt_for_body4_expanded[i0, i0] }; +; CHECK: new: { Stmt_for_end[i0] -> MemRef_C_Stmt_for_end_expanded[i0] }; +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @mse(double* %A, double* %B, double* %C, double* %D) { +entry: + %A.addr = alloca double*, align 8 + %B.addr = alloca double*, align 8 + %C.addr = alloca double*, align 8 + %D.addr = alloca double*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %i1 = alloca i32, align 4 + store double* %A, double** %A.addr, align 8 + store double* %B, double** %B.addr, align 8 + store double* %C, double** %C.addr, align 8 + store double* %D, double** %D.addr, align 8 + store i32 0, i32* %j, align 4 + br label %for.cond +for.cond: ; preds = %for.inc16, %entry + %0 = load i32, i32* %j, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end18 +for.body: ; preds = %for.cond + store i32 0, i32* %i1, align 4 + br label %for.cond2 +for.cond2: ; preds = %for.inc, %for.body + %1 = load i32, i32* %i1, align 4 + %cmp3 = icmp slt i32 %1, 10000 + br i1 %cmp3, label %for.body4, label %for.end +for.body4: ; preds = %for.cond2 + %2 = load i32, i32* %i1, align 4 + %conv = sitofp i32 %2 to double + %3 = load double*, double** %B.addr, align 8 + %4 = load i32, i32* %i1, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds double, double* %3, i64 %idxprom + store double %conv, double* %arrayidx, align 8 + %5 = load i32, i32* %i1, align 4 + %conv5 = sitofp i32 %5 to double + %6 = load double*, double** %D.addr, align 8 + %7 = load i32, i32* %i1, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds double, double* %6, i64 %idxprom6 + store double %conv5, double* %arrayidx7, align 8 + br label %for.inc +for.inc: ; preds = %for.body4 + %8 = load i32, i32* %i1, align 4 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i1, align 4 + br label %for.cond2 +for.end: ; preds = %for.cond2 + %9 = load double*, double** %B.addr, align 8 + %10 = load i32, i32* %j, align 4 + %idxprom8 = sext i32 %10 to i64 + %arrayidx9 = getelementptr inbounds double, double* %9, i64 %idxprom8 + %11 = load double, double* %arrayidx9, align 8 + %12 = load double*, double** %A.addr, align 8 + %13 = load i32, i32* %j, align 4 + %idxprom10 = sext i32 %13 to i64 + %arrayidx11 = getelementptr inbounds double, double* %12, i64 %idxprom10 + store double %11, double* %arrayidx11, align 8 + %14 = load double*, double** %D.addr, align 8 + %15 = load i32, i32* %j, align 4 + %idxprom12 = sext i32 %15 to i64 + %arrayidx13 = getelementptr inbounds double, double* %14, i64 %idxprom12 + %16 = load double, double* %arrayidx13, align 8 + %17 = load double*, double** %C.addr, align 8 + %18 = load i32, i32* %j, align 4 + %idxprom14 = sext i32 %18 to i64 + %arrayidx15 = getelementptr inbounds double, double* %17, i64 %idxprom14 + store double %16, double* %arrayidx15, align 8 + br label %for.inc16 +for.inc16: ; preds = %for.end + %19 = load i32, i32* %j, align 4 + %inc17 = add nsw i32 %19, 1 + store i32 %inc17, i32* %j, align 4 + br label %for.cond +for.end18: ; preds = %for.cond + ret void +}