Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -356,11 +356,46 @@ // If we think Phi may have been type-promoted, we also need to ensure that // all source operands of the reduction are either SExtInsts or ZEstInsts. If // so, we will be able to evaluate the reduction in the narrower bit width. - if (Start != Phi) + if (Start != Phi) { if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, IsSigned, VisitedInsts, CastInsts)) return false; + // We need to be certain that we are dealing with type-promoted Phi. So we + // check that the value that goes outside the loop is only used by LCSSA + // Phi node which is either dead or truncated to the type we expect. + PHINode *LCSSAPhi = nullptr; + for (User *U : ExitInstruction->users()) { + Instruction *UI = cast(U); + if (!TheLoop->contains(UI->getParent())) { + // Two users outside the loop, we should not have it. + if (LCSSAPhi) + return false; + + PHINode *PN = dyn_cast(UI); + // LCSSA Phi node we are looking for is a Phi with 1 input. + if (!PN || PN->getNumOperands() != 1) + return false; + LCSSAPhi = PN; + } + } + + if (LCSSAPhi) { + // If the found LCSSA has uses, make sure that it is the only one use that + // is a trunc to a proper type. Otherwise what we are dealing with is not + // a type-promoted Phi node. + if (LCSSAPhi->hasNUsesOrMore(2)) + return false; + + if (LCSSAPhi->hasOneUse()) { + TruncInst *Trunc = + dyn_cast(LCSSAPhi->use_begin()->getUser()); + if (!Trunc || Trunc->getType() != RecurrenceType) + return false; + } + } + } + // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. Index: test/Transforms/LoopVectorize/and-plus.ll =================================================================== --- test/Transforms/LoopVectorize/and-plus.ll +++ test/Transforms/LoopVectorize/and-plus.ll @@ -0,0 +1,109 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" +target triple = "x86_64-unknown-linux-gnu" + +; This should not be vectorized. Current algorithm tries to choose type i1 for +; vectorization because we apply and with 1 to the accumulator. But the result +; of this calculation does not fit into type i1. +; TODO: We vectorize this loop with type i32. For this, we need to be able to +; undo the changes made by lookThroughAnd after we find out that our idea that +; we are dealing with extended Phi of type i1 was wrong. +define i8 @test_01(i8 %c) #0 { + +; CHECK-LABEL: @test_01( +; CHECK-NOT: vector.body: +; CHECK-NOT: zext i1 {{.*}} to i8 + +entry: + br label %loop + +exit: ; preds = %loop + ret i8 %accum.plus + +loop: ; preds = %loop, %entry + %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ] + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %accum.and = and i8 %accum.phi, 1 + %accum.plus = add nuw nsw i8 %accum.and, 3 + %iv.next = add nuw nsw i32 %iv, 1 + %cond = icmp ugt i32 %iv, 191 + br i1 %cond, label %exit, label %loop +} + +; This can be vectorized with type i1 because the result is not used. +define void @test_02(i8 %c) #0 { + +; CHECK-LABEL: @test_02( +; CHECK: vector.body: +; CHECK: zext i1 {{.*}} to i8 + +entry: + br label %loop + +exit: ; preds = %loop + %lcssa = phi i8 [ %accum.plus, %loop ] + ret void + +loop: ; preds = %loop, %entry + %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ] + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %accum.and = and i8 %accum.phi, 1 + %accum.plus = add nuw nsw i8 %accum.and, 3 + %iv.next = add nuw nsw i32 %iv, 1 + %cond = icmp ugt i32 %iv, 191 + br i1 %cond, label %exit, label %loop +} + +; This can be vectorized with type i1 because the result is truncated properly. +define i1 @test_03(i8 %c) #0 { + +; CHECK-LABEL: @test_03( +; CHECK: vector.body: +; CHECK: zext i1 {{.*}} to i8 + +entry: + br label %loop + +exit: ; preds = %loop + %lcssa = phi i8 [ %accum.plus, %loop ] + %trunc = trunc i8 %lcssa to i1 + ret i1 %trunc + +loop: ; preds = %loop, %entry + %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ] + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %accum.and = and i8 %accum.phi, 1 + %accum.plus = add nuw nsw i8 %accum.and, 3 + %iv.next = add nuw nsw i32 %iv, 1 + %cond = icmp ugt i32 %iv, 191 + br i1 %cond, label %exit, label %loop +} + +; This cannot be vectorized with type i1 because the result is truncated to a +; wrong type. +; TODO: It can also be vectorized with type i32 (or maybe i4?) it we learn to +; undo lookThroughAnd. +define i4 @test_04(i8 %c) #0 { + +; CHECK-LABEL: @test_04( +; CHECK-NOT: vector.body: +; CHECK-NOT: zext i1 {{.*}} to i8 + +entry: + br label %loop + +exit: ; preds = %loop + %lcssa = phi i8 [ %accum.plus, %loop ] + %trunc = trunc i8 %lcssa to i4 + ret i4 %trunc + +loop: ; preds = %loop, %entry + %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ] + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %accum.and = and i8 %accum.phi, 1 + %accum.plus = add nuw nsw i8 %accum.and, 3 + %iv.next = add nuw nsw i32 %iv, 1 + %cond = icmp ugt i32 %iv, 191 + br i1 %cond, label %exit, label %loop +}