NOTE(review): this patch was reconstructed from a whitespace-collapsed copy in
which all C++ template argument lists had been stripped. Line breaks were
re-derived from the hunk-header line counts. Indentation, blank context lines,
the SmallVector/SmallPtrSet inline capacities and the std::min/std::max
template arguments are best-effort; confirm against the upstream tree before
applying.

Index: llvm/include/llvm/Analysis/IVDescriptors.h
===================================================================
--- llvm/include/llvm/Analysis/IVDescriptors.h
+++ llvm/include/llvm/Analysis/IVDescriptors.h
@@ -268,6 +268,16 @@
            cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd;
   }
 
+  /// Collect cast instructions given a reduction exit value and the minimal
+  /// type in which the reduction can be represented. If \p IgnoreCasts is true
+  /// then instructions that can be ignored in the vectorizer's cost model
+  /// are collected. Otherwise, casts on the input operands of the reduction
+  /// operation are collected.
+  static void collectCastInstrs(Loop *TheLoop, Instruction *Exit,
+                                Type *RecurrenceType,
+                                SmallPtrSetImpl<Instruction *> &Casts,
+                                bool IgnoreCasts);
+
 private:
   // The starting value of the recurrence.
   // It does not have to be zero!
Index: llvm/lib/Analysis/IVDescriptors.cpp
===================================================================
--- llvm/lib/Analysis/IVDescriptors.cpp
+++ llvm/lib/Analysis/IVDescriptors.cpp
@@ -159,12 +159,9 @@
                          IsSigned);
 }
 
-/// Collect cast instructions that can be ignored in the vectorizer's cost
-/// model, given a reduction exit value and the minimal type in which the
-/// reduction can be represented.
-static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
-                                 Type *RecurrenceType,
-                                 SmallPtrSetImpl<Instruction *> &Casts) {
+void RecurrenceDescriptor::collectCastInstrs(
+    Loop *TheLoop, Instruction *Exit, Type *RecurrenceType,
+    SmallPtrSetImpl<Instruction *> &Casts, bool IgnoreCasts) {
 
   SmallVector<Instruction *, 8> Worklist;
   SmallPtrSet<Instruction *, 8> Visited;
@@ -173,15 +170,25 @@
   while (!Worklist.empty()) {
     Instruction *Val = Worklist.pop_back_val();
     Visited.insert(Val);
-    if (auto *Cast = dyn_cast<CastInst>(Val))
-      if (Cast->getSrcTy() == RecurrenceType) {
-        // If the source type of a cast instruction is equal to the recurrence
-        // type, it will be eliminated, and should be ignored in the vectorizer
-        // cost model.
-        Casts.insert(Cast);
-        continue;
+    if (auto *Cast = dyn_cast<CastInst>(Val)) {
+      if (IgnoreCasts) {
+        if (Cast->getSrcTy() == RecurrenceType) {
+          // If the source type of a cast instruction is equal to the recurrence
+          // type, it will be eliminated, and should be ignored in the
+          // vectorizer cost model.
+          Casts.insert(Cast);
+          continue;
+        }
+      } else {
+        if (Cast->getDestTy() == RecurrenceType) {
+          // Add casts with destination type equal to the recurrence type. These
+          // are checked by the vectorizer when finding the widest type for
+          // in-loop reductions without any loads/stores.
+          Casts.insert(Cast);
+          continue;
+        }
       }
-
+    }
     // Add all operands to the work list if they are loop-varying values that
     // we haven't yet visited.
     for (Value *O : cast<Instruction>(Val)->operands())
@@ -512,7 +519,8 @@
     // without needing a white list of instructions to ignore.
     // This may also be useful for the inloop reductions, if it can be
     // kept simple enough.
-    collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
+    collectCastInstrs(TheLoop, ExitInstruction, RecurrenceType, CastInsts,
+                      true);
   }
 
   // We found a reduction var if we have reached the original phi node and we
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5975,11 +5975,39 @@
   unsigned MinWidth = -1U;
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-  for (Type *T : ElementTypesInLoop) {
-    MinWidth = std::min<unsigned>(
-        MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
-    MaxWidth = std::max<unsigned>(
-        MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+  // For in-loop reductions, no element types are added to ElementTypesInLoop
+  // if there are no loads/stores in the loop. In this case, check through the
+  // reduction variables to determine the maximum width.
+  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
+    // Reset MaxWidth so that we can find the smallest type used by recurrences
+    // in the loop.
+    MaxWidth = -1U;
+    for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
+      MaxWidth = std::min<unsigned>(
+          MaxWidth,
+          DL.getTypeSizeInBits(RdxDesc.getRecurrenceType()->getScalarType())
+              .getFixedSize());
+      // We also need to check the cast instructions in the loop as there may be
+      // extends on the input operands of the recurrence.
+      SmallPtrSet<Instruction *, 8> CastInstrs;
+      RecurrenceDescriptor::collectCastInstrs(
+          TheLoop, RdxDesc.getLoopExitInstr(), RdxDesc.getRecurrenceType(),
+          CastInstrs, false);
+      for (const Instruction *I : CastInstrs) {
+        MaxWidth = std::min<unsigned>(
+            MaxWidth,
+            DL.getTypeSizeInBits(cast<CastInst>(I)->getSrcTy()->getScalarType())
+                .getFixedSize());
+      }
+    }
+  } else {
+    for (Type *T : ElementTypesInLoop) {
+      MinWidth = std::min<unsigned>(
+          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+      MaxWidth = std::max<unsigned>(
+          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+    }
   }
   return {MinWidth, MaxWidth};
 }
Index: llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-target-instruction-cost=1 -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -31,3 +31,31 @@
 for.end:
   ret void
 }
+
+; For in-loop reductions with no loads or stores in the loop the widest type is
+; determined by looking through the recurrences, which allows a sensible VF to be
+; chosen.
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores"
+; CHECK: The Smallest and Widest types: 4294967295 / 16 bits
+; CHECK: Selecting VF: 8
+
+define double @no_loads_stores() {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i16 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i16 %i.08 to double
+  %mul = fmul double %conv, %conv
+  %add = fadd double %s.09, %mul
+  %inc = add nuw nsw i16 %i.08, 1
+  %exitcond.not = icmp eq i16 %inc, 12345
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi double [ %add, %for.body ]
+  ret double %.lcssa
+}
+
Index: llvm/test/Transforms/LoopVectorize/X86/funclet.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/funclet.ll
+++ llvm/test/Transforms/LoopVectorize/X86/funclet.ll
@@ -33,7 +33,7 @@
 
 ; CHECK-LABEL: define void @test1(
 ; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
-; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
+; CHECK: call <8 x double> @llvm.floor.v8f64(<8 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
 
 declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)
 