diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -965,11 +965,14 @@
         SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
       return false;
 
+  // If Previous has a NonLocal memory dependence, i.e. no memory dependence
+  // within its block, then memory instructions can be sunk past it.
+  bool CanSinkMemoryInstrs = MDR->getDependency(Previous).isNonLocal();
+
   // Ensure every user of the phi node (recursively) is dominated by the
   // previous value. The dominance requirement ensures the loop vectorizer will
   // not need to vectorize the initial value prior to the first iteration of the
   // loop.
-  // TODO: Consider extending this sinking to handle memory instructions.
 
   // We optimistically assume we can sink all users after Previous. Keep a set
   // of instructions to sink after Previous ordered by dominance in the common
@@ -996,9 +999,11 @@
                       SinkCandidate)) // We already are good w/o sinking.
       return true;
 
-    if (SinkCandidate->getParent() != PhiBB ||
-        SinkCandidate->mayHaveSideEffects() ||
-        SinkCandidate->mayReadFromMemory() || SinkCandidate->isTerminator())
+    if (SinkCandidate->getParent() != PhiBB || SinkCandidate->isTerminator())
+      return false;
+
+    if (!CanSinkMemoryInstrs && (SinkCandidate->mayHaveSideEffects() ||
+                                 SinkCandidate->mayReadFromMemory()))
       return false;
 
     // Avoid sinking an instruction multiple times (if multiple operands are
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -221,3 +221,102 @@
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+; Test that LoopVectorize is not inhibited by the load of a[i+1] appearing
+; after the store to w[i].
+;
+; Testcase derived from the following C:
+;
+; double w[1024], x[1024], a[1024];
+; void f() {
+;   for (unsigned i = 0; i < 1024-1; i++) {
+;     w[i] += a[i];
+;     x[i] += a[i+1];
+;   }
+; }
+;
+define void @fixedorder_memory_dependencies(ptr noalias %w, ptr noalias %x, ptr %a) {
+; CHECK-LABEL: @fixedorder_memory_dependencies(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_PRE:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x double> poison, double [[A_PRE]], i32 1
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1024 x double], ptr [[A]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1024 x double], ptr [[W:%.*]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x double], ptr [[X:%.*]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1]] = load <2 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[VECTOR_RECUR]], <2 x double> [[WIDE_LOAD1]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[WIDE_LOAD]], [[TMP7]]
+; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[WIDE_LOAD2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1022
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1023, 1022
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x double> [[WIDE_LOAD1]], i32 1
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x double> [[WIDE_LOAD1]], i32 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[A_PRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1022, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[A_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[A_IDX_NEXT:%.*]] = getelementptr inbounds [1024 x double], ptr [[A]], i64 0, i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[W_IDX:%.*]] = getelementptr inbounds [1024 x double], ptr [[W]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[X_IDX:%.*]] = getelementptr inbounds [1024 x double], ptr [[X]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[W_LOAD:%.*]] = load double, ptr [[W_IDX]], align 8
+; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast double [[W_LOAD]], [[SCALAR_RECUR]]
+; CHECK-NEXT:    store double [[ADD_0]], ptr [[W_IDX]], align 8
+; CHECK-NEXT:    [[A_NEXT]] = load double, ptr [[A_IDX_NEXT]], align 8
+; CHECK-NEXT:    [[X_LOAD:%.*]] = load double, ptr [[X_IDX]], align 8
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast double [[X_LOAD]], [[A_NEXT]]
+; CHECK-NEXT:    store double [[ADD_1]], ptr [[X_IDX]], align 8
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1023
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+  %a_pre = load double, ptr %a
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %a_via_phi = phi double [ %a_pre, %entry ], [ %a_next, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %a_idx_next = getelementptr inbounds [1024 x double], ptr %a, i64 0, i64 %indvars.iv.next
+  %w_idx = getelementptr inbounds [1024 x double], ptr %w, i64 0, i64 %indvars.iv
+  %x_idx = getelementptr inbounds [1024 x double], ptr %x, i64 0, i64 %indvars.iv
+  %w_load = load double, ptr %w_idx
+  %add_0 = fadd fast double %w_load, %a_via_phi
+
+  ; LV needs to be able to reason that this store can move past the next load.
+  store double %add_0, ptr %w_idx
+  %a_next = load double, ptr %a_idx_next
+
+  %x_load = load double, ptr %x_idx
+  %add_1 = fadd fast double %x_load, %a_next
+  store double %add_1, ptr %x_idx
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1023
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
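
Note on the IVDescriptors.cpp hunk: it assumes a MemoryDependenceResults handle (MDR) is available inside RecurrenceDescriptor's fixed-order recurrence check; the plumbing that threads it in is not part of the hunks shown above. A minimal sketch of the query the patch relies on, with an illustrative helper name that is not in the patch:

#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Illustrative helper (not part of the patch): returns true if
// MemoryDependenceAnalysis reports no dependence for 'Previous' within its
// own basic block, i.e. the result is NonLocal. This is the condition the
// patch uses to allow sinking memory instructions past 'Previous'.
static bool previousHasNoLocalMemDep(Instruction *Previous,
                                     MemoryDependenceResults &MDR) {
  return MDR.getDependency(Previous).isNonLocal();
}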