Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2014,6 +2014,42 @@
     return WideningDecisions[InstOnVF].second;
   }
 
+  /// Return True if instruction \p I is an optimizable truncate whose operand
+  /// is an induction variable. Such a truncate will be removed by adding a new
+  /// induction variable with the destination type.
+  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+
+    // If the instruction is not a truncate, return false.
+    auto *Trunc = dyn_cast<TruncInst>(I);
+    if (!Trunc)
+      return false;
+
+    // Get the source and destination types of the truncate.
+    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+    // If the truncate is free for the given types, return false. Replacing a
+    // free truncate with an induction variable would add an induction variable
+    // update instruction to each iteration of the loop. We exclude from this
+    // check the primary induction variable since it will need an update
+    // instruction regardless.
+    Value *Op = Trunc->getOperand(0);
+    if (Op != Legal->getInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+      return false;
+
+    // If the truncated value is not an induction variable, return false.
+    if (!Legal->isInductionVariable(Op))
+      return false;
+
+    // Lastly, we only consider an induction variable truncate to be
+    // optimizable if it has a constant step.
+    //
+    // TODO: Expand optimizable truncates to include truncations of induction
+    // variables having loop-invariant steps.
+    auto ID = Legal->getInductionVars()->lookup(cast<PHINode>(Op));
+    return ID.getConstIntStepValue();
+  }
+
 private:
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -4879,10 +4915,9 @@
       // induction variable. Notice that we can only optimize the 'trunc' case
       // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
       // (c) other casts depend on pointer size.
-      auto ID = Legal->getInductionVars()->lookup(OldInduction);
-      if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
-          ID.getConstIntStepValue()) {
-        widenIntInduction(OldInduction, cast<TruncInst>(CI));
+      if (Cost->isOptimizableIVTruncate(CI, VF)) {
+        widenIntInduction(cast<PHINode>(CI->getOperand(0)),
+                          cast<TruncInst>(CI));
         break;
       }
 
@@ -7224,12 +7259,14 @@
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
-    // We optimize the truncation of induction variable.
-    // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
-        Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+    // We optimize the truncation of induction variables having constant
+    // integer steps. The cost of these truncations is the same as the scalar
+    // operation.
+    if (isOptimizableIVTruncate(I, VF)) {
+      auto *Trunc = cast<TruncInst>(I);
+      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+                                  Trunc->getSrcTy());
+    }
 
     Type *SrcScalarTy = I->getOperand(0)->getType();
     Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
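To make the new predicate concrete, here is a minimal hand-written IR sketch (not produced by this patch; value names are illustrative) of the transformation isOptimizableIVTruncate enables: a truncate of a non-primary induction variable with constant step is replaced by a new induction variable in the destination type, following the shape of the non_primary_iv_trunc test below with VF = 2.

; Scalar loop: %j is a non-primary IV with constant step 2, used only truncated.
for.body:
  %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
  %tmp1 = trunc i64 %j to i32
  %j.next = add nuw nsw i64 %j, 2

; Vectorized body (VF = 2): no vector truncate remains; a new i32 IV starts at
; <i32 0, i32 2> and steps by VF * 2 = <i32 4, i32 4> each vector iteration.
vector.body:
  %vec.ind = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %vec.ind.next = add <2 x i32> %vec.ind, <i32 4, i32 4>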
Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @non_primary_iv_trunc_free(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @non_primary_iv_trunc_free(i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = trunc i64 %i to i32
+  %i.next = add nuw nsw i64 %i, 5
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll
@@ -773,3 +773,34 @@
 exit:
   ret void
 }
+
+; CHECK-LABEL: @non_primary_iv_trunc(
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:         [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK:         [[TMP3:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    %index.next = add i64 %index, 2
+; CHECK:         [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+define void @non_primary_iv_trunc(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = trunc i64 %j to i32
+  store i32 %tmp1, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %j.next = add nuw nsw i64 %j, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
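The AArch64 induction-trunc.ll test above exercises the isTruncateFree bail-out rather than the optimization itself: trunc i64 to i32 is free on AArch64, so creating a dedicated i32 induction variable for a free truncate of a non-primary IV would only add an update instruction per iteration. A rough sketch of the shape that test expects (VF = 1, interleave factor 2; names illustrative):

; Kept: the free truncates of the interleaved i64 induction values.
vector.body:
  %offset.idx = mul i64 %index, 5
  %induction  = add i64 %offset.idx, 0
  %induction1 = add i64 %offset.idx, 5
  %tmp4 = trunc i64 %induction to i32   ; free on AArch64
  %tmp5 = trunc i64 %induction1 to i32  ; free on AArch64
; Avoided: a second, truncated IV such as
;   %iv32 = phi i32 [ 0, %vector.ph ], [ %iv32.next, %vector.body ]
;   %iv32.next = add i32 %iv32, 10
; which would add an update instruction without removing any real work.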
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; Make sure that the reverse iterators are calculated using 64bit arithmetic, not 32. +; PR15882: This test ensures that we do not produce wrapping arithmetic when +; creating constant reverse step vectors. ; ; int foo(int n, int *A) { ; int sum; @@ -13,7 +14,7 @@ ; ;CHECK-LABEL: @foo( -;CHECK: +;CHECK: ;CHECK: ret define i32 @foo(i32 %n, i32* nocapture %A) { %1 = icmp sgt i32 %n, 0