Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -542,9 +542,9 @@
   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
   /// variable on which to base the steps, \p Step is the size of the step, and
   /// \p EntryVal is the value from the original loop that maps to the steps.
-  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
-  /// can be a truncate instruction).
-  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+  /// Note that \p EntryVal doesn't have to be an induction variable - it
+  /// can also be a truncate instruction.
+  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                         const InductionDescriptor &ID);

   /// Create a vector induction phi node based on an existing scalar one. \p
@@ -571,10 +571,20 @@
   /// vector loop for both the Phi and the cast.
   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
-  void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID,
-                                              Value *VectorLoopValue,
-                                              unsigned Part,
-                                              unsigned Lane = UINT_MAX);
+  ///
+  /// \p EntryVal is the value from the original loop that maps to the vector
+  /// phi node and is used to distinguish which IV is currently being
+  /// processed: the original one (if \p EntryVal is a phi corresponding to
+  /// the original IV) or the "newly-created" one based on the proof mentioned
+  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
+  /// In the latter case \p EntryVal is a TruncInst and we must not record
+  /// anything for that IV, but it's error-prone to expect callers of this
+  /// routine to care about that, hence this explicit parameter.
+  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+                                             const Instruction *EntryVal,
+                                             Value *VectorLoopValue,
+                                             unsigned Part,
+                                             unsigned Lane = UINT_MAX);

   /// Generate a shuffle sequence that will reverse the vector Vec.
   virtual Value *reverseVector(Value *Vec);
@@ -2368,6 +2378,8 @@

 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+         "Expected either an induction phi-node or a truncate of it!");
   Value *Start = II.getStartValue();

   // Construct the initial value of the vector IV in the vector loop preheader
@@ -2421,8 +2433,7 @@
     if (isa<TruncInst>(EntryVal))
       addMetadata(LastInduction, EntryVal);
-    else
-      recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

     LastInduction = cast<Instruction>(addFastMathFlag(
         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
@@ -2456,8 +2467,20 @@
 }

 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
-    const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
-    unsigned Lane) {
+    const InductionDescriptor &ID, const Instruction *EntryVal,
+    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+         "Expected either an induction phi-node or a truncate of it!");
+
+  // This induction variable is not the phi from the original loop but the
+  // newly-created IV based on the proof that the casted Phi is equal to the
+  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
+  // re-uses the same InductionDescriptor that the original IV uses, but we
+  // don't have to do any recording in this case - that is done when the
+  // original IV is processed.
+  if (isa<TruncInst>(EntryVal))
+    return;
+
   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
   if (Casts.empty())
     return;
@@ -2554,8 +2577,7 @@
     VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
     if (Trunc)
       addMetadata(EntryPart, Trunc);
-    else
-      recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
+    recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
   }
 }

@@ -2626,7 +2648,7 @@
 }

 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
-                                           Value *EntryVal,
+                                           Instruction *EntryVal,
                                            const InductionDescriptor &ID) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF > 1 && "VF should be greater than one");
@@ -2661,7 +2683,7 @@
       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
-      recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
+      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
     }
   }
 }
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/pr36524.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/pr36524.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/pr36524.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -scev-version-unknown -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 2, i64 3, i64 4, i64 5>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[OFFSET_IDX1:%.*]] = add i64 2, [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[OFFSET_IDX1]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+;
+entry:
+  br label %loop
+
+loop:
+  %0 = phi i64 [ 2, %entry ], [ %3, %loop ]
+  %1 = and i64 %0, 4294967295
+  %2 = trunc i64 %0 to i32
+  %3 = add nuw nsw i64 %1, 1
+  %4 = icmp sgt i32 %2, 80
+  br i1 %4, label %exit, label %loop
+
+exit:
+  ret void
+}
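
The behavioral core of the patch is easiest to see outside the diff: recordVectorLoopValueForInductionCast() now receives EntryVal and returns early when it is a TruncInst, so the cast-to-vector-value mapping is recorded exactly once, when the original induction phi is processed, rather than relying on each caller's if/else to skip the call. Below is a minimal, self-contained C++ sketch of that design choice; the stand-in types and the helper name recordForInductionCast are assumptions made for illustration, not LLVM's real classes or API.

// A minimal, standalone sketch (not LLVM code; the types and the helper name
// recordForInductionCast are made up for illustration) of the pattern this
// patch adopts: pass the original-loop entry value into the recording routine
// and let it bail out for the truncate-based IV, instead of making every
// caller remember to guard the call.
#include <cassert>
#include <iostream>
#include <unordered_map>
#include <vector>

struct Instruction {          // stand-in for llvm::Instruction
  bool IsTrunc;               // true models "EntryVal is a TruncInst"
};

struct InductionDescriptor {  // stand-in: only the cast list matters here
  std::vector<int> Casts;
};

// Mirrors the patched recordVectorLoopValueForInductionCast(): the callee
// decides that nothing is recorded for the truncated copy of the IV, because
// the cast mapping is already recorded when the original phi is processed.
void recordForInductionCast(const InductionDescriptor &ID,
                            const Instruction *EntryVal, int VectorLoopVal,
                            std::unordered_map<int, int> &Recorded) {
  assert(EntryVal && "expected a phi or a truncate of it");
  if (EntryVal->IsTrunc)
    return;                   // the "newly-created" IV: record nothing
  for (int Cast : ID.Casts)
    Recorded[Cast] = VectorLoopVal;
}

int main() {
  InductionDescriptor ID{{42}};            // one cast instruction, id 42
  std::unordered_map<int, int> Recorded;
  Instruction Phi{false};                  // the original induction phi
  Instruction Trunc{true};                 // its truncate
  recordForInductionCast(ID, &Trunc, 7, Recorded); // no effect
  recordForInductionCast(ID, &Phi, 9, Recorded);   // cast 42 -> value 9
  std::cout << Recorded.size() << " mapping(s); cast 42 -> " << Recorded[42]
            << "\n";
  return 0;
}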