diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -444,8 +444,9 @@ /// Holds the widest induction type encountered. Type *WidestIndTy = nullptr; - /// Allowed outside users. This holds the induction and reduction - /// vars which can be accessed from outside the loop. + /// Allowed outside users. This holds the variables that can be accessed from + /// outside the loop (including induction and reduction vars and non-header + /// phis). SmallPtrSet AllowedExit; /// Can we assume the absence of NaNs. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -595,6 +595,7 @@ // Unsafe cyclic dependencies with header phis are identified during // legalization for reduction, induction and first order // recurrences. + AllowedExit.insert(&I); continue; } diff --git a/llvm/test/Transforms/LoopVectorize/lv-fold-tail-by-masking-bug.ll b/llvm/test/Transforms/LoopVectorize/lv-fold-tail-by-masking-bug.ll --- a/llvm/test/Transforms/LoopVectorize/lv-fold-tail-by-masking-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/lv-fold-tail-by-masking-bug.ll @@ -32,41 +32,15 @@ ; build-all/bin/clang -O1 lv-bug.c -fvectorize && ./a.out ; the result is "BAD!" - -; FIXME: The result here is incorrect! See https://bugs.llvm.org/show_bug.cgi?id=43166 +; Used to miscompile (with clang 8.0.0), now we get +; loop not vectorized: Cannot fold tail by masking in the presence of live outs. define i64 @test1(i64 %y) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i64> , [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[INDUCTION]], -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP1]], -; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i1> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> , <4 x i64> [[TMP2]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 -; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: [[DIV:%.*]] = xor i64 3, [[Y]] @@ -75,9 +49,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -102,40 +76,15 @@ ret i64 %cond } - -; FIXME: The result here is incorrect! See https://bugs.llvm.org/show_bug.cgi?id=43166 +; Used to miscompile (with clang 8.0.0), now we get +; loop not vectorized: Cannot fold tail by masking in the presence of live outs. define i64 @test2(i64 %y) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[INDUCTION]], -; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i1> [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> , <4 x i64> -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 -; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: br label [[COND_END]] @@ -143,9 +92,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -169,38 +118,15 @@ ret i64 %cond } - -; FIXME: The result here is incorrect! See https://bugs.llvm.org/show_bug.cgi?id=43166 +; Used to miscompile (with clang 8.0.0), now we get +; loop not vectorized: Cannot fold tail by masking in the presence of live outs. define i32 @test3(i64 %y) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP0]], -; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i1> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 -; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: br label [[COND_END]] @@ -208,9 +134,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, [[COND_FALSE]] ], [ [[I]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !7 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ] ; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; entry: