Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -413,6 +413,24 @@ /// from SCEV or creates a new using SCEVExpander. virtual Value *getStepVector(Value *Val, int StartIdx, const SCEV *Step); + /// Compute a step vector like the above functions, but scalarize the + /// arithmetic instead. The results of the computation are inserted into a + /// new vector with VF elements. Step is a value. + Value *getScalarizedStepVector(Value *Start, int Index, Value *Step, + unsigned VF); + + /// Compute a step vector like the above functions, but scalarize the + /// arithmetic instead. The results of the computation are inserted into a + /// new vector with VF elements. Step is a SCEV, and we use a SCEVExpander to + /// create a value from it. + Value *getScalarizedStepVector(Value *Start, int Index, const SCEV *Step, + unsigned VF); + + /// Determine if an induction variable is only used for counting loop + /// iterations or calculating addresses. Induction variables without + /// interesting users (e.g, loads and stores) should not be widened. + bool isIVOnlyUsedForAddrCalc(PHINode *Phi); + /// Create a vector induction variable based on an existing scalar one. /// Currently only works for integer induction variables with a constant /// step. @@ -2155,6 +2173,40 @@ return Builder.CreateAdd(Val, Step, "induction"); } +Value *InnerLoopVectorizer::getScalarizedStepVector(Value *Start, int Index, + const SCEV *StepSCEV, + unsigned VF) { + auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + auto *StepValue = Exp.expandCodeFor(StepSCEV, StepSCEV->getType(), + &*Builder.GetInsertPoint()); + return getScalarizedStepVector(Start, Index, StepValue, VF); +} + +Value *InnerLoopVectorizer::getScalarizedStepVector(Value *Start, int Index, + Value *Step, unsigned VF) { + + // We can't create a vector with less than two elements. + assert(VF > 1 && "VF should be greater than one"); + + // Get the start value type and ensure it and the step value have the same + // integer type. + Type *StartTy = Start->getType()->getScalarType(); + assert(StartTy->isIntegerTy() && StartTy == Step->getType() && + "Start and Step should have the same integer type"); + + // Compute the scalarized step vector. We perform scalar arithmetic and then + // insert the results into the step vector. + Value *StepVector = UndefValue::get(ToVectorTy(StartTy, VF)); + for (unsigned I = 0; I < VF; ++I) { + auto *Mul = Builder.CreateMul(ConstantInt::get(StartTy, Index + I), Step); + auto *Add = Builder.CreateAdd(Start, Mul); + StepVector = Builder.CreateInsertElement(StepVector, Add, I); + } + + return StepVector; +} + int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); auto *SE = PSE.getSE(); @@ -4085,6 +4137,46 @@ return BlockMask; } +bool InnerLoopVectorizer::isIVOnlyUsedForAddrCalc(PHINode *Phi) { + + // The value of the induction variable corresponding to the loop latch. + auto *LatchValue = Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); + + // Look at the users of the induction variable inside the loop, and determine + // if any of them are interesting. If we find interesting users (e.g., loads + // and stores), the induction variable should be widened. If the induction + // variable is only used for counting loop iterations or calculating + // addresses, it shouldn't be widened. + auto HasInterestingUser = false; + for (auto I = Phi->user_begin(), E = Phi->user_end(); + I != E && !HasInterestingUser; ++I) { + auto *PhiUser = cast(*I); + + // Ignore the latch value. + if (PhiUser == LatchValue) + continue; + + // Ignore values not in the loop (induction variables are allowed to have + // users outside the loop). + if (!OrigLoop->contains(PhiUser)) + continue; + + // Determine if the user is interesting or not. We currently assume all + // users are interesting except getelementptr instructions, which are + // scalarized. + switch (PhiUser->getOpcode()) { + default: + HasInterestingUser = true; + case Instruction::GetElementPtr: + break; + } + } + + // Return whether or not we found any interesting users of the induction + // variable. + return !HasInterestingUser; +} + void InnerLoopVectorizer::widenPHIInstruction( Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, unsigned VF, PhiVector *PV) { @@ -4153,7 +4245,7 @@ case InductionDescriptor::IK_IntInduction: { assert(P->getType() == II.getStartValue()->getType() && "Types must match"); if (VF == 1 || P->getType() != Induction->getType() || - !II.getConstIntStepValue()) { + !II.getConstIntStepValue() || isIVOnlyUsedForAddrCalc(P)) { Value *V = Induction; // Handle other induction variables that are now based on the // canonical one. @@ -4162,6 +4254,16 @@ V = II.transform(Builder, V, PSE.getSE(), DL); V->setName("offset.idx"); } + + // If an induction variable is only used for counting loop iterations or + // calculating addresses, it shouldn't be widened. Scalarize the step + // vector to give InstCombine a better chance of simplifying it. + if (VF > 1 && isIVOnlyUsedForAddrCalc(P)) { + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getScalarizedStepVector(V, VF * part, II.getStep(), VF); + return; + } + Value *Broadcasted = getBroadcastInstrs(V); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. @@ -4345,11 +4447,24 @@ Legal->getInductionVars()->lookup(OldInduction); if (auto StepValue = II.getConstIntStepValue()) { IntegerType *TruncType = cast(CI->getType()); - if (VF == 1) { + if (VF == 1 || isIVOnlyUsedForAddrCalc(OldInduction)) { StepValue = ConstantInt::getSigned(TruncType, StepValue->getSExtValue()); Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); + + // If an induction variable is only used for counting loop + // iterations or calculating addresses, it shouldn't be widened. + // Scalarize the step vector to give InstCombine a better chance of + // simplifying it. + if (VF > 1 && isIVOnlyUsedForAddrCalc(OldInduction)) { + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getScalarizedStepVector(ScalarCast, VF * Part, + StepValue, VF); + addMetadata(Entry, &*it); + break; + } + Value *Broadcasted = getBroadcastInstrs(ScalarCast); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue); Index: test/Transforms/LoopVectorize/gep_with_bitcast.ll =================================================================== --- test/Transforms/LoopVectorize/gep_with_bitcast.ll +++ test/Transforms/LoopVectorize/gep_with_bitcast.ll @@ -12,11 +12,11 @@ ; CHECK-LABEL: @foo ; CHECK: vector.body -; CHECK: %0 = phi -; CHECK: %2 = getelementptr inbounds double*, double** %in, i64 %0 -; CHECK: %3 = bitcast double** %2 to <4 x i64>* -; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %3, align 8 -; CHECK: %4 = icmp eq <4 x i64> %wide.load, zeroinitializer +; CHECK: %[[IV:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %[[v0:.+]] = getelementptr inbounds double*, double** %in, i64 %[[IV]] +; CHECK: %[[v1:.+]] = bitcast double** %[[v0]] to <4 x i64>* +; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %[[v1]], align 8 +; CHECK: icmp eq <4 x i64> %wide.load, zeroinitializer ; CHECK: br i1 define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 { Index: test/Transforms/LoopVectorize/induction.ll =================================================================== --- test/Transforms/LoopVectorize/induction.ll +++ test/Transforms/LoopVectorize/induction.ll @@ -66,6 +66,44 @@ ret void } +; Make sure we don't widen induction variables that don't have interesting +; users inside the loop. Doing so causes us to create unnecessary phi nodes and +; computation inside the loop. +; +; for (int i = 0; i < n; ++i) +; sum += a[i]; +; +; IND-LABEL: @scalarize_induction_variable( +; IND: vector.body: +; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; IND-NOT: add i64 {{.*}}, 2 +; IND: getelementptr inbounds i64, i64* %a, i64 %index +; +; UNROLL-LABEL: @scalarize_induction_variable( +; UNROLL: vector.body: +; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; UNROLL-NOT: add i64 {{.*}}, 4 +; UNROLL: %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index +; UNROLL: getelementptr i64, i64* %[[g1]], i64 2 + +define i64 @scalarize_induction_variable(i64 *%a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %sum = phi i64 [ %2, %for.body ], [ 0, %entry ] + %0 = getelementptr inbounds i64, i64* %a, i64 %i + %1 = load i64, i64* %0, align 8 + %2 = add i64 %1, %sum + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %3 = phi i64 [ %2, %for.body ] + ret i64 %3 +} ; Make sure that the loop exit count computation does not overflow for i8 and ; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the @@ -114,9 +152,11 @@ ; CHECK-LABEL: max_i32_backedgetaken ; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked +; CHECK: middle.block: +; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0 ; CHECK: scalar.ph: -; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ] -; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ] +; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ] define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { Index: test/Transforms/LoopVectorize/iv_outside_user.ll =================================================================== --- test/Transforms/LoopVectorize/iv_outside_user.ll +++ test/Transforms/LoopVectorize/iv_outside_user.ll @@ -22,8 +22,8 @@ ; CHECK-LABEL: @preinc ; CHECK-LABEL: middle.block: -; CHECK: %3 = sub i32 %n.vec, 1 -; CHECK: %ind.escape = add i32 0, %3 +; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1 +; CHECK: %ind.escape = add i32 0, %[[v3]] ; CHECK-LABEL: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ] ; CHECK-LABEL: for.end: Index: test/Transforms/LoopVectorize/reverse_induction.ll =================================================================== --- test/Transforms/LoopVectorize/reverse_induction.ll +++ test/Transforms/LoopVectorize/reverse_induction.ll @@ -5,9 +5,24 @@ ; Make sure consecutive vector generates correct negative indices. ; PR15882 -; CHECK-LABEL: @reverse_induction_i64( -; CHECK: %step.add = add <4 x i64> %vec.ind, -; CHECK: %step.add2 = add <4 x i64> %step.add, +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i64 %startval, %index +; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3 define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) { entry: @@ -30,8 +45,25 @@ } ; CHECK-LABEL: @reverse_induction_i128( -; CHECK: %step.add = add <4 x i128> %vec.ind, -; CHECK: %step.add2 = add <4 x i128> %step.add, +; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i128 %startval, %index +; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i128> undef, i128 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i128> %[[v0]], i128 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i128> %[[v1]], i128 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i128> %[[v2]], i128 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i128> undef, i128 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i128> %[[v4]], i128 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i128> %[[v5]], i128 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i128> %[[v6]], i128 %[[a7]], i64 3 + define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) { entry: br label %for.body @@ -53,8 +85,24 @@ } ; CHECK-LABEL: @reverse_induction_i16( -; CHECK: add <4 x i16> %[[SPLAT:.*]], -; CHECK: add <4 x i16> %[[SPLAT]], +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i16 %startval, {{.*}} +; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i16> undef, i16 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i16> %[[v0]], i16 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i16> %[[v1]], i16 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i16> %[[v2]], i16 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i16> undef, i16 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i16> %[[v4]], i16 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i16> %[[v5]], i16 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i16> %[[v6]], i16 %[[a7]], i64 3 define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) { entry: