diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2193,14 +2193,6 @@
     return false;
   }
 
-  if (Hints.getInterleave() > 1) {
-    // TODO: Interleave support is future work.
-    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
-                         "outer loops.\n");
-    Hints.emitRemarkWithHints();
-    return false;
-  }
-
   return true;
 }
 
@@ -4159,13 +4151,15 @@
       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
       if (!VPPhi)
        continue;
-      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
-      // Make sure the builder has a valid insert point.
-      Builder.SetInsertPoint(NewPhi);
-      for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
-        VPValue *Inc = VPPhi->getIncomingValue(i);
-        VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
-        NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
+
+      for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+        PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, Part));
+        Builder.SetInsertPoint(NewPhi);
+        for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+          VPValue *Inc = VPPhi->getIncomingValue(i);
+          VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+          NewPhi->addIncoming(State.get(Inc, Part), State.CFG.VPBB2IRBB[VPBB]);
+        }
       }
     }
   }
 
@@ -9843,6 +9837,12 @@
 
   CM.collectElementTypesForWidening();
 
+  // The VPlan-native path does not have a cost model, so the only way to get
+  // an unroll factor is to query the loop vectorization hints.
+  unsigned UF = Hints.getInterleave();
+  if (!UF)
+    UF = 1;
+
   // Plan how to best vectorize, return the best VF and its cost.
   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
 
@@ -9858,10 +9858,10 @@
     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                              F->getParent()->getDataLayout());
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
-                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
+                           VF.Width, UF, LVL, &CM, BFI, PSI, Checks);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                       << L->getHeader()->getParent()->getName() << "\"\n");
-    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
+    LVP.executePlan(VF.Width, UF, BestPlan, LB, DT, false);
   }
 
   // Mark the loop as already vectorized to avoid vectorizing again.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1332,10 +1332,12 @@
       StartIdx = I;
     }
   }
-  Value *Op0 = State.get(getOperand(StartIdx), 0);
-  Type *VecTy = Op0->getType();
-  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
-  State.set(this, VecPhi, 0);
+
+  Type *VecTy = State.get(getOperand(StartIdx), 0)->getType();
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
+    State.set(this, VecPhi, Part);
+  }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
--- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -117,13 +117,11 @@
 }
 
 ; Case 3: Annotated outer loop WITH vector width and interleave information
-; doesn't have to be collected.
+; has to be collected.
 
 ; CHECK-LABEL: case3
-; CHECK-NOT: LV: Loop hints: force=enabled
-; CHECK-NOT: LV: We can vectorize this outer loop!
-; CHECK: LV: Loop hints: force=?
-; CHECK: LV: Found a loop: inner.body
+; CHECK: LV: Loop hints: force=enabled width=4 interleave=2
+; CHECK: LV: We can vectorize this outer loop!
 
 define void @case3(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_unroll.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_unroll.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_unroll.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -force-vector-width=4 -passes=loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+
+@A = external local_unnamed_addr global [1024 x float], align 4
+@B = external local_unnamed_addr global [512 x float], align 4
+
+; Test that the VPlan-native path unrolls/interleaves the outer loop when requested via hints.
+define void @foo() {
+; CHECK-LABEL: define void @foo() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[OUTER_LOOP_LATCH9:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[OUTER_LOOP_LATCH9]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, <4 x i64> [[STEP_ADD]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    br label [[INNER_LOOP3:%.*]]
+; CHECK:       inner_loop3:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP7:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x float> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x float> [ [[WIDE_MASKED_GATHER2]], [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <4 x i64> [[VEC_PHI4]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    [[TMP4]] = fmul <4 x float> [[VEC_PHI5]], [[WIDE_MASKED_GATHER7]]
+; CHECK-NEXT:    [[TMP5]] = fmul <4 x float> [[VEC_PHI6]], [[WIDE_MASKED_GATHER8]]
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP7]] = add nuw nsw <4 x i64> [[VEC_PHI4]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], <i64 512, i64 512, i64 512, i64 512>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 512, i64 512, i64 512, i64 512>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    br i1 [[TMP10]], label [[OUTER_LOOP_LATCH9]], label [[INNER_LOOP3]]
+; CHECK:       outer_loop_latch9:
+; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi <4 x float> [ [[TMP4]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP5]], [[INNER_LOOP3]] ]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[VEC_PHI10]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[VEC_PHI11]], <4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw <4 x i64> [[STEP_ADD]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i64> [[TMP11]], <i64 1024, i64 1024, i64 1024, i64 1024>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq <4 x i64> [[TMP12]], <i64 1024, i64 1024, i64 1024, i64 1024>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECK:       outer_loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[OUTER_LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 [[I]]
+; CHECK-NEXT:    [[X_START:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECK:       inner_loop:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ 0, [[OUTER_LOOP]] ], [ [[J_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECK-NEXT:    [[X:%.*]] = phi float [ [[X_START]], [[OUTER_LOOP]] ], [ [[X_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, i64 [[J]]
+; CHECK-NEXT:    [[B:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[X_NEXT]] = fmul float [[X]], [[B]]
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], 512
+; CHECK-NEXT:    br i1 [[INNER_EXITCOND]], label [[OUTER_LOOP_LATCH]], label [[INNER_LOOP]]
+; CHECK:       outer_loop_latch:
+; CHECK-NEXT:    [[X_NEXT_LCSSA:%.*]] = phi float [ [[X_NEXT]], [[INNER_LOOP]] ]
+; CHECK-NEXT:    store float [[X_NEXT_LCSSA]], ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[OUTER_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[OUTER_EXITCOND]], label [[EXIT]], label [[OUTER_LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer_loop_latch ]
+  %arrayidx1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %i
+  %x.start = load float, ptr %arrayidx1, align 4
+  br label %inner_loop
+
+inner_loop:
+  %j = phi i64 [ 0, %outer_loop ], [ %j.next, %inner_loop ]
+  %x = phi float [ %x.start, %outer_loop ], [ %x.next, %inner_loop ]
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr @B, i64 0, i64 %j
+  %b = load float, ptr %arrayidx2, align 4
+  %x.next = fmul float %x, %b
+  %j.next = add nuw nsw i64 %j, 1
+  %inner_exitcond = icmp eq i64 %j.next, 512
+  br i1 %inner_exitcond, label %outer_loop_latch, label %inner_loop
+
+outer_loop_latch:
+  store float %x.next, ptr %arrayidx1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %outer_exitcond = icmp eq i64 %i.next, 1024
+  br i1 %outer_exitcond, label %exit, label %outer_loop, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3, !4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.width", i32 4}
+!4 = !{!"llvm.loop.interleave.count", i32 2}
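
Reviewer note (not part of the patch): the !llvm.loop metadata at the end of outer_loop_unroll.ll is the form clang emits for its loop pragmas. Below is a minimal C sketch of the same loop nest, assuming clang; the pragma options and the -enable-vplan-native-path flag are real, but the exact driver invocation and function name are illustrative only.

/* Compiling roughly as
 *   clang -O2 -mllvm -enable-vplan-native-path foo.c
 * attaches llvm.loop.vectorize.enable, llvm.loop.vectorize.width and
 * llvm.loop.interleave.count metadata equivalent to !2, !3 and !4 above. */
extern float A[1024];
extern float B[512];

void foo(void) {
#pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  for (long i = 0; i < 1024; ++i) {
    float x = A[i];               /* x.start */
    for (long j = 0; j < 512; ++j)
      x *= B[j];                  /* inner-loop reduction over B */
    A[i] = x;                     /* one store per outer iteration */
  }
}

With interleave_count(2) the vector body in the checks above handles eight outer iterations per trip (two <4 x i64> parts), which is why [[INDEX_NEXT]] advances by 8.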