Index: lib/Analysis/ScopBuilder.cpp
===================================================================
--- lib/Analysis/ScopBuilder.cpp
+++ lib/Analysis/ScopBuilder.cpp
@@ -102,6 +102,18 @@
     cl::desc("Disable multiplicative reductions"), cl::Hidden, cl::ZeroOrMore,
     cl::init(false), cl::cat(PollyCategory));
 
+enum class GranularityChoice { BasicBlocks, Stores }; // Stmt granularity modes
+
+static cl::opt<GranularityChoice>
+    StmtGranularity("polly-stmt-granularity",
+                    cl::desc("Select the statement granularity algorithm"),
+                    cl::values(clEnumValN(GranularityChoice::BasicBlocks, "bb",
+                                          "Entire basic blocks granularity"),
+                               clEnumValN(GranularityChoice::Stores, "store",
+                                          "Store-level granularity")),
+                    cl::init(GranularityChoice::BasicBlocks), // default: "bb"
+                    cl::cat(PollyCategory));
+
 void ScopBuilder::buildPHIAccesses(ScopStmt *PHIStmt, PHINode *PHI,
                                    Region *NonAffineSubRegion,
                                    bool IsExitBlock) {
@@ -686,7 +698,9 @@
         Loop *L = LI.getLoopFor(Inst.getParent());
         if (shouldModelInst(&Inst, L))
           Instructions.push_back(&Inst);
-        if (Inst.getMetadata("polly_split_after")) {
+        if (Inst.getMetadata("polly_split_after") ||
+            (StmtGranularity == GranularityChoice::Stores &&
+             isa<StoreInst>(Inst))) {
           Loop *SurroundingLoop = LI.getLoopFor(I->getNodeAs<BasicBlock>());
           scop->addScopStmt(I->getNodeAs<BasicBlock>(), SurroundingLoop,
                             Instructions, Count);
@@ -694,9 +708,11 @@
           Instructions.clear();
         }
       }
-      Loop *SurroundingLoop = LI.getLoopFor(I->getNodeAs<BasicBlock>());
-      scop->addScopStmt(I->getNodeAs<BasicBlock>(), SurroundingLoop,
-                        Instructions, Count);
+      if (Count == 0 || !Instructions.empty()) {
+        Loop *SurroundingLoop = LI.getLoopFor(I->getNodeAs<BasicBlock>());
+        scop->addScopStmt(I->getNodeAs<BasicBlock>(), SurroundingLoop,
+                          Instructions, Count);
+      }
     }
 }
 
@@ -720,9 +736,13 @@
       Split = false;
       Count++;
     }
-    if (Inst.getMetadata("polly_split_after"))
+    if (Inst.getMetadata("polly_split_after") ||
+        (StmtGranularity == GranularityChoice::Stores && isa<StoreInst>(Inst)))
       Split = true;
 
+    if (Count > scop->getStmtListFor(&BB).size() - 1)
+      break;
+
     if (Stmt && Stmt->isBlockStmt() && Stmt != scop->getStmtListFor(&BB)[Count])
       continue;
 
Index: test/Isl/CodeGen/non-affine-dominance-generated-entering-1.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/non-affine-dominance-generated-entering-1.ll
@@ -0,0 +1,44 @@
+; RUN: opt %loadPolly -polly-codegen -polly-stmt-granularity=store -S < %s | FileCheck %s
+;
+; llvm.org/PR25439
+; Scalar reloads in the generated entering block were not recognized as
+; dominating the subregion blocks when there were multiple entering nodes. As a
+; result, values defined there (here: %cond, used in subregionB_entry) were not
+; copied. We check that the generated code reuses the reloaded scalar.
+;
+; CHECK-LABEL: polly.stmt.subregionB_entry.exit:
+; CHECK:         store i1 %polly.cond, i1* %cond.s2a
+;
+; CHECK-LABEL: polly.stmt.subregionB_entry.entry:
+; CHECK:         %cond.s2a.reload = load i1, i1* %cond.s2a
+;
+; CHECK-LABEL: polly.stmt.subregionB_entry:
+; CHECK:         br i1 %cond.s2a.reload
+
+define void @func(i32* %A) {
+entry:
+  br label %subregionA_entry
+
+subregionA_entry:
+  %cond = phi i1 [ false, %entry ], [ true, %subregionB_exit ]
+  br i1 %cond, label %subregionA_if, label %subregionA_else
+
+subregionA_if:
+  br label %subregionB_entry
+
+subregionA_else:
+  br label %subregionB_entry
+
+subregionB_entry: ; entered from both subregionA_if and subregionA_else
+  store i32 0, i32* %A
+  br i1 %cond, label %subregionB_if, label %subregionB_exit ; %cond is defined in subregionA_entry
+
+subregionB_if:
+  br label %subregionB_exit
+
+subregionB_exit:
+  br i1 false, label %subregionA_entry, label %return ; constant-false condition: always goes to %return
+
+return:
+  ret void
+}
Index: test/ScopInfo/invariant_load_zext_parameter-3.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_zext_parameter-3.ll
@@ -0,0 +1,112 @@
+; RUN: opt %loadPolly -scalar-evolution-max-value-compare-depth=3 -polly-scops -polly-invariant-load-hoisting=true -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -scalar-evolution-max-value-compare-depth=3 -polly-codegen -polly-invariant-load-hoisting=true -polly-stmt-granularity=store -analyze < %s
+;
+; Stress test for the code generation of invariant accesses.
+;
+;    void f(int *I0, int *I1, int *I2, int *V, long p0, long p1, long p2, long p3) {
+;      *V = *I1;
+;      for (int i = 0; i < 1000; i++) {
+;        long n0 = p0 * *I1 + p1 * *I1;
+;        V[i] = I0[n0];
+;        long m0 = p0 * (I2[0]);
+;        long m1 = p1 * (I2[1]);
+;        long m2 = p2 * (I2[2]);
+;        long m3 = p3 * (I2[3]);
+;        int j = 0;
+;        do {
+;          if (j > 0) {
+;            V[i] += I1[m0 + m2];
+;            V[i] += I1[n0];
+;          }
+;        } while (j++ < m1 + m3 * n0);
+;      }
+;    }
+;
+; CHECK: p0: ((sext i32 %tmp6 to i64) * %p1)
+; CHECK: p1: ((sext i32 %tmp3 to i64) * (sext i32 %tmp8 to i64) * (%p0 + %p1) * %p3)
+; CHECK: p2: ((sext i32 %tmp3 to i64) * (%p0 + %p1))
+; CHECK: p3: ((sext i32 %tmp5 to i64) * %p0)
+; CHECK: p4: ((sext i32 %tmp7 to i64) * %p2)
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %I0, i32* %I1, i32* %I2, i32* %V, i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
+entry:
+  %tmp = load i32, i32* %I1, align 4 ; *I1
+  store i32 %tmp, i32* %V, align 4 ; *V = *I1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv1 = phi i64 [ %indvars.iv.next2, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv1, 1000 ; i != 1000 (loop bound)
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp3 = load i32, i32* %I1, align 4
+  %conv = sext i32 %tmp3 to i64
+  %mul = mul nsw i64 %conv, %p0
+  %conv1 = sext i32 %tmp3 to i64
+  %mul2 = mul nsw i64 %conv1, %p1
+  %add = add nsw i64 %mul, %mul2 ; n0 = p0 * *I1 + p1 * *I1
+  %arrayidx = getelementptr inbounds i32, i32* %I0, i64 %add
+  %tmp4 = load i32, i32* %arrayidx, align 4 ; I0[n0]
+  %arrayidx3 = getelementptr inbounds i32, i32* %V, i64 %indvars.iv1
+  store i32 %tmp4, i32* %arrayidx3, align 4 ; V[i] = I0[n0]
+  %tmp5 = load i32, i32* %I2, align 4
+  %conv5 = sext i32 %tmp5 to i64
+  %mul6 = mul nsw i64 %conv5, %p0 ; m0 = p0 * I2[0]
+  %arrayidx7 = getelementptr inbounds i32, i32* %I2, i64 1
+  %tmp6 = load i32, i32* %arrayidx7, align 4
+  %conv8 = sext i32 %tmp6 to i64
+  %mul9 = mul nsw i64 %conv8, %p1 ; m1 = p1 * I2[1]
+  %arrayidx10 = getelementptr inbounds i32, i32* %I2, i64 2
+  %tmp7 = load i32, i32* %arrayidx10, align 4
+  %conv11 = sext i32 %tmp7 to i64
+  %mul12 = mul nsw i64 %conv11, %p2 ; m2 = p2 * I2[2]
+  %arrayidx13 = getelementptr inbounds i32, i32* %I2, i64 3
+  %tmp8 = load i32, i32* %arrayidx13, align 4
+  %conv14 = sext i32 %tmp8 to i64
+  %mul15 = mul nsw i64 %conv14, %p3 ; m3 = p3 * I2[3]
+  br label %do.body
+
+do.body:                                          ; preds = %do.cond, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ 0, %for.body ]
+  %cmp16 = icmp sgt i64 %indvars.iv, 0 ; j > 0
+  br i1 %cmp16, label %if.then, label %if.end
+
+if.then:                                          ; preds = %do.body
+  %add18 = add nsw i64 %mul6, %mul12 ; m0 + m2
+  %arrayidx19 = getelementptr inbounds i32, i32* %I1, i64 %add18
+  %tmp9 = load i32, i32* %arrayidx19, align 4 ; I1[m0 + m2]
+  %arrayidx21 = getelementptr inbounds i32, i32* %V, i64 %indvars.iv1
+  %tmp10 = load i32, i32* %arrayidx21, align 4
+  %add22 = add nsw i32 %tmp10, %tmp9
+  store i32 %add22, i32* %arrayidx21, align 4 ; V[i] += I1[m0 + m2]
+  %arrayidx23 = getelementptr inbounds i32, i32* %I1, i64 %add
+  %tmp11 = load i32, i32* %arrayidx23, align 4 ; I1[n0]
+  %arrayidx25 = getelementptr inbounds i32, i32* %V, i64 %indvars.iv1
+  %tmp12 = load i32, i32* %arrayidx25, align 4
+  %add26 = add nsw i32 %tmp12, %tmp11
+  store i32 %add26, i32* %arrayidx25, align 4 ; V[i] += I1[n0]
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %do.body
+  br label %do.cond
+
+do.cond:                                          ; preds = %if.end
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %mul28 = mul nsw i64 %mul15, %add ; m3 * n0
+  %add29 = add nsw i64 %mul9, %mul28 ; m1 + m3 * n0
+  %cmp30 = icmp slt i64 %indvars.iv, %add29 ; j < m1 + m3 * n0
+  br i1 %cmp30, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.cond
+  br label %for.inc
+
+for.inc:                                          ; preds = %do.end
+  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv1, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}