diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -77,10 +77,12 @@
   RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
                        FastMathFlags FMF, Instruction *ExactFP, Type *RT,
                        bool Signed, bool Ordered,
-                       SmallPtrSetImpl<Instruction *> &CI)
+                       SmallPtrSetImpl<Instruction *> &CI,
+                       unsigned MinWidthCastToRecurTy)
       : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
         ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
-        IsOrdered(Ordered) {
+        IsOrdered(Ordered),
+        MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
     CastInsts.insert(CI.begin(), CI.end());
   }
 
@@ -251,6 +253,11 @@
   /// recurrence.
   const SmallPtrSet<Instruction *, 8> &getCastInsts() const { return CastInsts; }
 
+  /// Returns the min source width in bits of casts to the recurrence type.
+  unsigned getMinWidthCastToRecurrenceTypeInBits() const {
+    return MinWidthCastToRecurrenceType;
+  }
+
   /// Returns true if all source operands of the recurrence are SExtInsts.
   bool isSigned() const { return IsSigned; }
 
@@ -291,6 +298,8 @@
   bool IsOrdered = false;
   // Instructions used for type-promoting the recurrence.
   SmallPtrSet<Instruction *, 8> CastInsts;
+  // Min source width of casts to the recurrence type; -1U if none found.
+  unsigned MinWidthCastToRecurrenceType = -1U;
 };
 
 /// A struct for saving information about induction variables.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -161,19 +161,22 @@
 
 /// Collect cast instructions that can be ignored in the vectorizer's cost
 /// model, given a reduction exit value and the minimal type in which the
-/// reduction can be represented.
-static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
-                                 Type *RecurrenceType,
-                                 SmallPtrSetImpl<Instruction *> &Casts) {
+/// reduction can be represented. Also search casts to the recurrence type
+/// to find the minimum width used by the recurrence.
+static void collectCastInstrs(Loop *TheLoop, Instruction *Exit,
+                              Type *RecurrenceType,
+                              SmallPtrSetImpl<Instruction *> &Casts,
+                              unsigned &MinWidthCastToRecurTy) {
 
   SmallVector<Instruction *, 8> Worklist;
   SmallPtrSet<Instruction *, 8> Visited;
   Worklist.push_back(Exit);
+  MinWidthCastToRecurTy = -1U;
 
   while (!Worklist.empty()) {
     Instruction *Val = Worklist.pop_back_val();
     Visited.insert(Val);
-    if (auto *Cast = dyn_cast<CastInst>(Val))
+    if (auto *Cast = dyn_cast<CastInst>(Val)) {
       if (Cast->getSrcTy() == RecurrenceType) {
         // If the source type of a cast instruction is equal to the recurrence
         // type, it will be eliminated, and should be ignored in the vectorizer
@@ -181,7 +184,16 @@
         Casts.insert(Cast);
         continue;
       }
-
+      if (Cast->getDestTy() == RecurrenceType) {
+        // The minimum width used by the recurrence is found by checking for
+        // casts on its operands. The minimum width is used by the vectorizer
+        // when finding the widest type for in-loop reductions without any
+        // loads/stores.
+        MinWidthCastToRecurTy = std::min<unsigned>(
+            MinWidthCastToRecurTy, Cast->getSrcTy()->getScalarSizeInBits());
+        continue;
+      }
+    }
     // Add all operands to the work list if they are loop-varying values that
     // we haven't yet visited.
     for (Value *O : cast<User>(Val)->operands())
@@ -265,6 +277,7 @@
   // Data used for determining if the recurrence has been type-promoted.
   Type *RecurrenceType = Phi->getType();
   SmallPtrSet<Instruction *, 4> CastInsts;
+  unsigned MinWidthCastToRecurrenceType;
   Instruction *Start = Phi;
   bool IsSigned = false;
 
@@ -500,21 +513,24 @@
         computeRecurrenceType(ExitInstruction, DB, AC, DT);
     if (ComputedType != RecurrenceType)
       return false;
-
-    // The recurrence expression will be represented in a narrower type. If
-    // there are any cast instructions that will be unnecessary, collect them
-    // in CastInsts. Note that the 'and' instruction was already included in
-    // this list.
-    //
-    // TODO: A better way to represent this may be to tag in some way all the
-    //       instructions that are a part of the reduction. The vectorizer cost
-    //       model could then apply the recurrence type to these instructions,
-    //       without needing a white list of instructions to ignore.
-    //       This may also be useful for the inloop reductions, if it can be
-    //       kept simple enough.
-    collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
   }
 
+  // Collect cast instructions and the minimum width used by the recurrence.
+  // If the starting value is not the same as the phi node and the computed
+  // recurrence type is equal to the recurrence type, the recurrence expression
+  // will be represented in a narrower or wider type. If there are any cast
+  // instructions that will be unnecessary, collect them in CastInsts.
+  // Note that the 'and' instruction was already included in this list.
+  //
+  // TODO: A better way to represent this may be to tag in some way all the
+  //       instructions that are a part of the reduction. The vectorizer cost
+  //       model could then apply the recurrence type to these instructions,
+  //       without needing a white list of instructions to ignore.
+  //       This may also be useful for the inloop reductions, if it can be
+  //       kept simple enough.
+  collectCastInstrs(TheLoop, ExitInstruction, RecurrenceType, CastInsts,
+                    MinWidthCastToRecurrenceType);
+
   // We found a reduction var if we have reached the original phi node and we
   // only have a single instruction with out-of-loop users.
 
@@ -524,7 +540,8 @@
   // Save the description of this reduction variable.
   RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
                           ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, IsOrdered, CastInsts);
+                          IsSigned, IsOrdered, CastInsts,
+                          MinWidthCastToRecurrenceType);
   RedDes = RD;
 
   return true;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5960,11 +5960,29 @@
   unsigned MinWidth = -1U;
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-  for (Type *T : ElementTypesInLoop) {
-    MinWidth = std::min<unsigned>(
-        MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
-    MaxWidth = std::max<unsigned>(
-        MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+  // For in-loop reductions, no element types are added to ElementTypesInLoop
+  // if there are no loads/stores in the loop. In this case, check through the
+  // reduction variables to determine the maximum width.
+  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
+    // Reset MaxWidth so that we can find the smallest type used by recurrences
+    // in the loop.
+    MaxWidth = -1U;
+    for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
+      // When finding the min width used by the recurrence we need to account
+      // for casts on the input operands of the recurrence.
+      MaxWidth = std::min<unsigned>(
+          MaxWidth, std::min<unsigned>(
+                        RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
+                        RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
+    }
+  } else {
+    for (Type *T : ElementTypesInLoop) {
+      MinWidth = std::min<unsigned>(
+          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+      MaxWidth = std::max<unsigned>(
+          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+    }
   }
   return {MinWidth, MaxWidth};
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-target-instruction-cost=1 -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -31,3 +31,74 @@
 for.end:
   ret void
 }
+
+; For in-loop reductions with no loads or stores in the loop the widest type is
+; determined by looking through the recurrences, which allows a sensible VF to be
+; chosen. The following 3 cases check different combinations of widths.
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_32"
+; CHECK: The Smallest and Widest types: 4294967295 / 32 bits
+; CHECK: Selecting VF: 4
+
+define double @no_loads_stores_32(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i32 %i.08 to float
+  %conv1 = fpext float %conv to double
+  %add = fadd double %s.09, %conv1
+  %inc = add nuw i32 %i.08, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi double [ %add, %for.body ]
+  ret double %.lcssa
+}
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_16"
+; CHECK: The Smallest and Widest types: 4294967295 / 16 bits
+; CHECK: Selecting VF: 8
+
+define double @no_loads_stores_16() {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i16 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i16 %i.08 to double
+  %add = fadd double %s.09, %conv
+  %inc = add nuw nsw i16 %i.08, 1
+  %exitcond.not = icmp eq i16 %inc, 12345
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi double [ %add, %for.body ]
+  ret double %.lcssa
+}
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_8"
+; CHECK: The Smallest and Widest types: 4294967295 / 8 bits
+; CHECK: Selecting VF: 16
+
+define float @no_loads_stores_8() {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i8 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i8 %i.08 to float
+  %add = fadd float %s.09, %conv
+  %inc = add nuw nsw i8 %i.08, 1
+  %exitcond.not = icmp eq i8 %inc, 12345
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi float [ %add, %for.body ]
+  ret float %.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
@@ -33,7 +33,7 @@
 
 ; CHECK-LABEL: define void @test1(
 ; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
-; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
+; CHECK: call <8 x double> @llvm.floor.v8f64(<8 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
 
 declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)
 