Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -422,6 +422,14 @@ /// from SCEV or creates a new using SCEVExpander. virtual Value *getStepVector(Value *Val, int StartIdx, const SCEV *Step); + /// Create a vector induction variable based on an existing scalar one. + /// Currently only works for integer primary induction variables with + /// a constant step. + /// If TruncType is provided, instead of widening the original IV, we + /// widen a version of the IV truncated to TruncType. + void widenInductionVariable(const InductionDescriptor &II, VectorParts &Entry, + IntegerType *TruncType = nullptr); + /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. /// When we widen (vectorize) values we place them in the map. 
If the values @@ -2099,6 +2107,40 @@ return getStepVector(Val, StartIdx, StepValue); } +void InnerLoopVectorizer::widenInductionVariable(const InductionDescriptor &II, + VectorParts &Entry, + IntegerType *TruncType) { + Value *Start = II.getStartValue(); + ConstantInt *Step = II.getConstIntStepValue(); + assert(Step && "Can not widen an IV with a non-constant step"); + + // Construct the initial value of the vector IV in the vector loop preheader + auto CurrIP = Builder.saveIP(); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + if (TruncType) { + Step = ConstantInt::getSigned(TruncType, Step->getSExtValue()); + Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); + } + Value *SplatStart = Builder.CreateVectorSplat(VF, Start); + Value *SteppedStart = getStepVector(SplatStart, 0, Step); + Builder.restoreIP(CurrIP); + + Value *SplatVF = + ConstantVector::getSplat(VF, ConstantInt::get(Start->getType(), VF)); + // We may need to add the step a number of times, depending on the unroll + // factor. The last of those goes into the PHI. + PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", + &*LoopVectorBody->getFirstInsertionPt()); + Value *LastInduction = VecInd; + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part] = LastInduction; + LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add"); + } + + VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); + VecInd->addIncoming(LastInduction, LoopVectorBody); +} + Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step) { assert(Val->getType()->isVectorTy() && "Must be a vector"); @@ -4056,19 +4098,25 @@ llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: { assert(P->getType() == II.getStartValue()->getType() && "Types must match"); - // Handle other induction variables that are now based on the - // canonical one. 
- Value *V = Induction; - if (P != OldInduction) { - V = Builder.CreateSExtOrTrunc(Induction, P->getType()); - V = II.transform(Builder, V, PSE.getSE(), DL); - V->setName("offset.idx"); - } - Value *Broadcasted = getBroadcastInstrs(V); - // After broadcasting the induction variable we need to make the vector - // consecutive by adding 0, 1, 2, etc. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep()); + if (P != OldInduction || VF == 1) { + Value *V = Induction; + // Handle other induction variables that are now based on the + // canonical one. + if (P != OldInduction) { + V = Builder.CreateSExtOrTrunc(Induction, P->getType()); + V = II.transform(Builder, V, PSE.getSE(), DL); + V->setName("offset.idx"); + } + Value *Broadcasted = getBroadcastInstrs(V); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep()); + } else { + // Instead of re-creating the vector IV by splatting the scalar IV + // in each iteration, we can make a new independent vector IV. 
+ widenInductionVariable(II, Entry); + } return; } case InductionDescriptor::IK_PtrInduction: @@ -4239,15 +4287,23 @@ if (CI->getOperand(0) == OldInduction && it->getOpcode() == Instruction::Trunc) { InductionDescriptor II = - Legal->getInductionVars()->lookup(OldInduction); + Legal->getInductionVars()->lookup(OldInduction); if (auto StepValue = II.getConstIntStepValue()) { - StepValue = ConstantInt::getSigned(cast<IntegerType>(CI->getType()), - StepValue->getSExtValue()); - Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, - CI->getType()); - Value *Broadcasted = getBroadcastInstrs(ScalarCast); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue); + IntegerType *TruncType = cast<IntegerType>(CI->getType()); + if (VF == 1) { + StepValue = + ConstantInt::getSigned(TruncType, StepValue->getSExtValue()); + Value *ScalarCast = + Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); + Value *Broadcasted = getBroadcastInstrs(ScalarCast); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue); + } else { + // Truncating a vector induction variable on each iteration + // may be expensive. Instead, truncate the initial value, and create + // a new, truncated, vector IV based on that.
+ widenInductionVariable(II, Entry, TruncType); + } addMetadata(Entry, &*it); break; } Index: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll +++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll @@ -43,7 +43,7 @@ ; CHECK-LABEL: @s173 ; CHECK: load <4 x float>, <4 x float>* -; CHECK: add i64 %index, 16000 +; CHECK: add nsw i64 %.lhs, 16000 ; CHECK: ret i32 0 } Index: llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -95,7 +95,7 @@ %struct.In = type { float, float } ;AVX512-LABEL: @foo2 -;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1 +;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1 ;AVX512: llvm.masked.gather.v16f32 ;AVX512: llvm.masked.store.v16f32 ;AVX512: ret void @@ -170,10 +170,10 @@ ;} ;AVX512-LABEL: @foo3 -;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1 +;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1 ;AVX512: llvm.masked.gather.v16f32 ;AVX512: fadd <16 x float> -;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1 +;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1 ;AVX512: llvm.masked.scatter.v16f32 ;AVX512: ret void Index: llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll +++ llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll @@ -8,7 +8,7 @@ @a = common global [2048 x i32] zeroinitializer, align 16 ;CHECK-LABEL: @example12( -;CHECK: trunc i64 
+;CHECK: %vec.ind1 = phi <4 x i32> ;CHECK: store <4 x i32> ;CHECK: ret void define void @example12() nounwind uwtable ssp { Index: llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll +++ llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll @@ -368,7 +368,7 @@ } ;CHECK-LABEL: @example12( -;CHECK: trunc i64 +;CHECK: %vec.ind1 = phi <4 x i32> ;CHECK: store <4 x i32> ;CHECK: ret void define void @example12() nounwind uwtable ssp { Index: llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll +++ llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll @@ -12,10 +12,11 @@ ; CHECK-LABEL: @foo ; CHECK: vector.body -; CHECK: %0 = getelementptr inbounds double*, double** %in, i64 %index -; CHECK: %1 = bitcast double** %0 to <4 x i64>* -; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %1, align 8 -; CHECK: %2 = icmp eq <4 x i64> %wide.load, zeroinitializer +; CHECK: %0 = phi +; CHECK: %2 = getelementptr inbounds double*, double** %in, i64 %0 +; CHECK: %3 = bitcast double** %2 to <4 x i64>* +; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %3, align 8 +; CHECK: %4 = icmp eq <4 x i64> %wide.load, zeroinitializer ; CHECK: br i1 define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 { @@ -37,4 +38,4 @@ for.end: ret void -} \ No newline at end of file +} Index: llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll +++ llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll @@ -12,7 +12,7 @@ @PA = external global i32* -;; === First, the tests that should always vectorize, wither statically or by 
adding run-time checks === +;; === First, the tests that should always vectorize, whether statically or by adding run-time checks === ; /// Different objects, positive induction, constant distance @@ -387,7 +387,7 @@ ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias08( -; CHECK: sub <4 x i32> +; CHECK: sub nuw nsw <4 x i32> ; CHECK: ret define i32 @noAlias08(i32 %a) #0 { @@ -439,7 +439,7 @@ ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias09( -; CHECK: sub <4 x i32> +; CHECK: sub nuw nsw <4 x i32> ; CHECK: ret define i32 @noAlias09(i32 %a) #0 { @@ -721,7 +721,7 @@ ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias14( -; CHECK: sub <4 x i32> +; CHECK: sub nuw nsw <4 x i32> ; CHECK: ret define i32 @noAlias14(i32 %a) #0 { Index: llvm/trunk/test/Transforms/LoopVectorize/induction.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/induction.ll +++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll @@ -1,4 +1,6 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -27,8 +29,6 @@ ret void } -; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND - ; Make sure we remove unneeded vectorization of induction variables. 
; In order for instcombine to cleanup the vectorized induction variables that we ; create in the loop vectorizer we need to perform some form of redundancy @@ -241,3 +241,64 @@ exit: ret void } + +; Check that we generate vectorized IVs in the pre-header +; instead of widening the scalar IV inside the loop, when +; we know how to do that. +; IND-LABEL: veciv +; IND: vector.body: +; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add, %vector.body ] +; IND: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2> +; IND: %index.next = add i32 %index, 2 +; IND: %[[CMP:.*]] = icmp eq i32 %index.next +; IND: br i1 %[[CMP]] +; UNROLL-LABEL: veciv +; UNROLL: vector.body: +; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add1, %vector.body ] +; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2> +; UNROLL: %step.add1 = add <2 x i32> %vec.ind, <i32 4, i32 4> +; UNROLL: %index.next = add i32 %index, 4 +; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next +; UNROLL: br i1 %[[CMP]] +define void @veciv(i32* nocapture %a, i32 %start, i32 %k) { +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv + store i32 %indvars.iv, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, %k + br i1 %exitcond, label %exit, label %for.body + +exit: + ret void +} + +; IND-LABEL: trunciv +; IND: vector.body: +; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ] +; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2> +; IND: %index.next = add i64 %index, 2 +; IND: %[[CMP:.*]] = icmp eq i64 %index.next +; IND: br i1 %[[CMP]] +define void @trunciv(i32* nocapture
%a, i32 %start, i64 %k) { +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %trunc.iv = trunc i64 %indvars.iv to i32 + %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv + store i32 %trunc.iv, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %k + br i1 %exitcond, label %exit, label %for.body + +exit: + ret void +} Index: llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll +++ llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -6,8 +6,11 @@ @array = common global [1024 x i32] zeroinitializer, align 16 ;CHECK-LABEL: @array_at_plus_one( -;CHECK: add i64 %index, 12 -;CHECK: trunc i64 +;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %step.add, %vector.body ] +;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %step.add2, %vector.body ] +;CHECK: add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4> +;CHECK: add nsw <4 x i64> %vec.ind, <i64 12, i64 12, i64 12, i64 12> ;CHECK: ret i32 define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0