Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -701,6 +701,30 @@
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
 
+  /// \return True if the number of call preserved registers is available
+  /// for the target. This is the case when the target has a single class
+  /// of vector register, or when the number of preserved registers is the
+  /// same for every class (e.g. the x86-64 System V ABI, where no XMM, YMM
+  /// or ZMM registers are preserved). If the ABI preserves different numbers
+  /// depending on the class, it can't be handled without tracking in which
+  /// class of register a value will be stored. For example, Win64 preserves
+  /// XMM8-XMM15. If the target supports AVX, a value may be in either an
+  /// XMM or a YMM register, and the ABI cannot be handled. However, if the
+  /// target does not support AVX, a vector value can only be in an XMM
+  /// register and a value can be returned for the ABI.
+  bool hasNumberOfCallPreservedRegisters(bool Vector) const;
+
+  /// \return The number of scalar or vector registers preserved across a
+  /// call. The returned value only applies when
+  /// hasNumberOfCallPreservedRegisters returns true.
+  unsigned getNumberOfCallPreservedRegisters(bool Vector) const;
+
+  /// \return The maximum interleave factor that should be used for the
+  /// target when a loop contains a call. By default, the target's
+  /// getMaxInterleaveFactor value is used. The returned value only applies
+  /// when hasNumberOfCallPreservedRegisters returns false.
+  unsigned getMaxCallInterleaveFactor(unsigned VF) const;
+
   /// \return The size of a cache line in bytes.
  unsigned getCacheLineSize() const;
@@ -1086,6 +1110,9 @@
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
+  virtual bool hasNumberOfCallPreservedRegisters(bool Vector) = 0;
+  virtual unsigned getNumberOfCallPreservedRegisters(bool Vector) = 0;
+  virtual unsigned getMaxCallInterleaveFactor(unsigned VF) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
@@ -1380,6 +1407,15 @@
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }
+  bool hasNumberOfCallPreservedRegisters(bool Vector) override {
+    return Impl.hasNumberOfCallPreservedRegisters(Vector);
+  }
+  unsigned getNumberOfCallPreservedRegisters(bool Vector) override {
+    return Impl.getNumberOfCallPreservedRegisters(Vector);
+  }
+  unsigned getMaxCallInterleaveFactor(unsigned VF) override {
+    return Impl.getMaxCallInterleaveFactor(VF);
+  }
   unsigned getRegisterBitWidth(bool Vector) const override {
     return Impl.getRegisterBitWidth(Vector);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -350,6 +350,12 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  bool hasNumberOfCallPreservedRegisters(bool Vector) { return false; }
+
+  unsigned getNumberOfCallPreservedRegisters(bool Vector) { return 0; }
+
+  unsigned getMaxCallInterleaveFactor(unsigned VF) { return 1; }
+
   unsigned getMinVectorRegisterBitWidth() { return 128; }
 
   bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -441,6 +441,12 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  unsigned getMaxCallInterleaveFactor(unsigned VF) {
+    // By default the maximum call interleave factor is the target's
+    // maximum interleave factor (i.e. calls have no effect).
+    return static_cast<T *>(this)->getMaxInterleaveFactor(VF);
+  }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -332,6 +332,25 @@
   return TTIImpl->getNumberOfRegisters(Vector);
 }
 
+bool TargetTransformInfo::hasNumberOfCallPreservedRegisters(bool Vector) const {
+  return TTIImpl->hasNumberOfCallPreservedRegisters(Vector);
+}
+
+unsigned
+TargetTransformInfo::getNumberOfCallPreservedRegisters(bool Vector) const {
+  assert(TTIImpl->hasNumberOfCallPreservedRegisters(Vector) &&
+         "Shouldn't be called when hasNumberOfCallPreservedRegisters() "
+         "returns false");
+  return TTIImpl->getNumberOfCallPreservedRegisters(Vector);
+}
+
+unsigned TargetTransformInfo::getMaxCallInterleaveFactor(unsigned VF) const {
+  assert(!TTIImpl->hasNumberOfCallPreservedRegisters(VF > 1) &&
+         "Shouldn't be called when hasNumberOfCallPreservedRegisters() "
+         "returns true");
+  return TTIImpl->getMaxCallInterleaveFactor(VF);
+}
+
 unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
   return TTIImpl->getRegisterBitWidth(Vector);
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -62,6 +62,8 @@
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
+  bool hasNumberOfCallPreservedRegisters(bool Vector);
+  unsigned getNumberOfCallPreservedRegisters(bool Vector);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -169,6 +169,31 @@
   return 2;
 }
 
+bool X86TTIImpl::hasNumberOfCallPreservedRegisters(bool Vector) {
+  if (!Vector || !ST->is64Bit())
+    return false;
+
+  // The Win64 ABI preserves XMM8-XMM15. If the target does not support
+  // AVX, then a vector value must be in an XMM register. Otherwise
+  // the value could be in a YMM or ZMM register (in which case it
+  // will not be preserved).
+  if (ST->isOSWindows() && !ST->hasAVX())
+    return true;
+
+  // If the target uses the System V ABI, no vector registers are
+  // preserved.
+  return (ST->isTargetDarwin() || ST->isTargetLinux() ||
+          ST->isTargetSolaris() || ST->isTargetKFreeBSD() ||
+          ST->isTargetPS4());
+}
+
+unsigned X86TTIImpl::getNumberOfCallPreservedRegisters(bool Vector) {
+  if (ST->isOSWindows())
+    return 8;
+
+  return 0;
+}
+
 int X86TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1227,11 +1227,18 @@
 
     /// Holds the maximum number of concurrent live intervals in the loop.
     unsigned MaxLocalUsers;
+
+    /// Holds the maximum number of live registers at a callsite in the
+    /// loop.
+    unsigned MaxCallUsers;
+
+    /// Holds the maximum interleave factor for a call in the loop.
+    unsigned MaxCallIC;
   };
 
   /// \return Returns information about the register usages of the loop for the
   /// given vectorization factors.
-  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
+  SmallVector<RegisterUsage, 8>
+  calculateRegisterUsage(ArrayRef<unsigned> VFs, bool handleCalls = false);
 
   /// Collect values we want to ignore in the cost model.
   void collectValuesToIgnore();
@@ -3193,6 +3200,7 @@
   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
 }
+
 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
@@ -5169,7 +5177,7 @@
     TargetNumRegisters = ForceTargetNumVectorRegs;
   }
 
-  RegisterUsage R = calculateRegisterUsage({VF})[0];
+  RegisterUsage R = calculateRegisterUsage({VF}, true)[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
@@ -5190,6 +5198,12 @@
     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                        std::max(1U, (R.MaxLocalUsers - 1)));
 
+  // If the loop contains a call with live values at the callsite, use an IC
+  // no greater than the maximum call IC (rounded down to a power of 2). An
+  // IC above this will cause additional spilling.
+  if (R.MaxCallUsers > 0)
+    IC = std::min(IC, (unsigned)PowerOf2Floor(R.MaxCallIC));
+
   // Clamp the interleave ranges to reasonable counts.
   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
 
@@ -5278,7 +5292,8 @@
 }
 
 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
-LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
+LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs,
+                                                   bool handleCalls) {
   // This function calculates the register usage by measuring the highest
   // number of values that are alive at a single location. Obviously, this
   // is a very rough estimation. We scan the loop in a topological order in
   // order to assign a number to each instruction.
@@ -5362,6 +5377,8 @@
 
   SmallVector<RegisterUsage, 8> RUs(VFs.size());
   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> MaxCallUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> MaxCallIC(VFs.size(), -1);
 
   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
 
@@ -5391,21 +5408,81 @@
 
     // For each VF find the maximum usage of registers.
     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
-      if (VFs[j] == 1) {
-        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
-        continue;
-      }
-      collectUniformsAndScalars(VFs[j]);
-      // Count the number of live intervals.
       unsigned RegUsage = 0;
-      for (auto Inst : OpenIntervals) {
-        // Skip ignored values for VF > 1.
-        if (VecValuesToIgnore.count(Inst) ||
-            isScalarAfterVectorization(Inst, VFs[j]))
-          continue;
-        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+      if (VFs[j] == 1)
+        RegUsage = OpenIntervals.size();
+      else {
+        collectUniformsAndScalars(VFs[j]);
+        // Count the number of live intervals.
+        for (auto Inst : OpenIntervals) {
+          // Skip ignored values for VF > 1.
+          if (VecValuesToIgnore.count(Inst) ||
+              isScalarAfterVectorization(Inst, VFs[j]))
+            continue;
+          RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+        }
       }
       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+
+      // If the instruction is a call, calculate the register pressure.
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (!handleCalls)
+          continue;
+
+        MaxCallUsages[j] = std::max(MaxCallUsages[j], RegUsage);
+
+        if (!TTI.hasNumberOfCallPreservedRegisters(VFs[j] > 1)) {
+          MaxCallIC[j] = TTI.getMaxCallInterleaveFactor(VFs[j]);
+          continue;
+        }
+
+        if (RegUsage == 0)
+          continue;
+
+        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+        bool NeedToScalarize;
+        unsigned CallCost =
+            getVectorCallCost(CI, VFs[j], TTI, TLI, NeedToScalarize);
+
+        // If the vector intrinsic is cheaper than the vector call, assume
+        // it will be expanded, leaving no call (if it is the same cost it
+        // is likely to be scalarized).
+        if (ID && getVectorIntrinsicCost(CI, VFs[j], TTI, TLI) < CallCost)
+          continue;
+
+        // Calculate the number of live values that are not used after the
+        // call. This is the number of values that "end" at the next
+        // instruction.
+        unsigned DeadAfterCall = 0;
+
+        if ((i + 1) == Index)
+          DeadAfterCall = RegUsage;
+        else {
+          InstrList &List = TransposeEnds[i + 1];
+          if (VFs[j] == 1)
+            DeadAfterCall = List.size();
+          else
+            for (Instruction *Inst : List)
+              DeadAfterCall += GetRegUsage(Inst->getType(), VFs[j]);
+        }
+
+        unsigned RetUsage = VFs[j] == 1 ? 1 : GetRegUsage(I->getType(), VFs[j]);
+
+        // The interleave factor for the call is the number of times the
+        // call and the live values at the callsite can be cloned without
+        // causing additional spilling. For the live values, this is simply
+        // the number of registers preserved across the call divided by the
+        // number of live values. However, the calculation must also take
+        // into account the additional registers needed for the return
+        // values of the cloned calls (the return value of the last call
+        // does not need to be preserved). Similarly, some of the live
+        // values may not be used after the call (e.g. call arguments).
+        // When the additional registers for the returns equal the number
+        // of registers dead after the call, they cancel each other out.
+        unsigned PR = TTI.getNumberOfCallPreservedRegisters(VFs[j] > 1);
+        unsigned Diff = DeadAfterCall > RetUsage ? 0 : RetUsage - DeadAfterCall;
+        MaxCallIC[j] = std::min(MaxCallIC[j], (PR + Diff) / (RegUsage + Diff));
+      }
     }
 
     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                       << OpenIntervals.size() << '\n');
@@ -5431,6 +5508,16 @@
     RU.LoopInvariantRegs = Invariant;
     RU.MaxLocalUsers = MaxUsages[i];
+
+    if (handleCalls) {
+      LLVM_DEBUG(dbgs() << "LV(REG): Found max call usage: "
+                        << MaxCallUsages[i] << '\n');
+      LLVM_DEBUG(dbgs() << "LV(REG): Found max call IC: "
+                        << MaxCallIC[i] << '\n');
+      RU.MaxCallUsers = MaxCallUsages[i];
+      RU.MaxCallIC = MaxCallIC[i];
+    }
+
     RUs[i] = RU;
   }
Index: test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
+++ test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
@@ -0,0 +1,101 @@
+; RUN: opt -S -mtriple=x86_64-unknown-linux -mcpu=btver2 -vector-library=SVML -loop-vectorize < %s | FileCheck %s
+
+; This test checks that when a call is vectorized with a vector library call,
+; the interleave count used is 1 (i.e. the call appears only once). Since
+; loops with reductions are treated specially by the cost model, we also
+; test this case. Finally, a test is included that checks that the
+; interleave count is not restricted when the call is not vectorized to a
+; vector library call.
+
+; CHECK-LABEL: sinf-test
+; CHECK: call <8 x float> @__svml_sinf8
+; CHECK-NOT: call <8 x float> @__svml_sinf8
+
+define void @sinf-test(float* nocapture readonly %a, float* nocapture %b, i32 %n) {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sinf(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare dso_local float @sinf(float) local_unnamed_addr
+
+; CHECK-LABEL: sinf-reduc-test
+; CHECK: call fast <8 x float> @__svml_sinf8
+; CHECK-NOT: call fast <8 x float> @__svml_sinf8
+
+define float @sinf-reduc-test(float* nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %s.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call fast float @llvm.sin.f32(float %0)
+  %add = fadd fast float %1, %s.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.sin.f32(float)
+
+; CHECK-LABEL: ceilf-test
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+
+define void @ceilf-test(float* nocapture readonly %a, float* nocapture %b, i32 %n) {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call float @llvm.ceil.f32(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %1, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.ceil.f32(float)
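
As a worked example of the MaxCallIC arithmetic in calculateRegisterUsage()
above, here is a minimal standalone sketch. The helper name maxCallInterleave
and all input values are hypothetical, chosen to match the Win64-without-AVX
case, where the preserved XMM8-XMM15 give PR = 8:

#include <cstdio>

// Sketch of the MaxCallIC computation from calculateRegisterUsage().
// RegUsage must be non-zero; the patch skips this computation when no
// values are live across the call.
static unsigned maxCallInterleave(unsigned PR,       // call-preserved registers
                                  unsigned RegUsage, // values live across the call
                                  unsigned RetUsage, // registers for the return value
                                  unsigned DeadAfterCall) { // values dying right after
  // Extra registers each clone needs for its return value, less the
  // registers freed by live values that die immediately after the call.
  unsigned Diff = DeadAfterCall > RetUsage ? 0 : RetUsage - DeadAfterCall;
  // With IC clones, roughly IC * (RegUsage + Diff) - Diff registers must
  // survive the final call, so IC <= (PR + Diff) / (RegUsage + Diff).
  return (PR + Diff) / (RegUsage + Diff);
}

int main() {
  // Hypothetical callsite: 8 preserved XMM registers, 2 live vector
  // values, 1 returned vector, 1 live value dead after the call:
  // Diff = 1 - 1 = 0, so the bound is (8 + 0) / (2 + 0) = 4.
  printf("max call IC: %u\n", maxCallInterleave(8, 2, 1, 1)); // prints 4
}

getInterleaveCount() then rounds this bound down to a power of 2 and clamps
it against the target's ordinary maximum interleave factor.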