diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -788,10 +788,23 @@ /// Additional properties of an operand's values. enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; - /// \return The number of scalar or vector registers that the target has. - /// If 'Vectors' is true, it returns the number of vector registers. If it is - /// set to false, it returns the number of scalar registers. - unsigned getNumberOfRegisters(bool Vector) const; + /// \return the number of registers in the target-provided register class. + unsigned getNumberOfRegisters(unsigned ClassID) const; + + /// \return the target-provided register class ID for the provided type, + /// accounting for type promotion and other type-legalization techniques that the target might apply. + /// However, it specifically does not account for the scalarization or splitting of vector types. + /// Should a vector type require scalarization or splitting into multiple underlying vector registers, + /// that type should be mapped to a register class containing no registers. + /// Specifically, this is designed to provide a simple, high-level view of the register allocation + /// later performed by the backend. These register classes don't necessarily map onto the + /// register classes used by the backend. + /// FIXME: It's not currently possible to determine how many registers + /// are used by the provided type. + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; + + /// \return the target-provided register class name + const char* getRegisterClassName(unsigned ClassID) const; /// \return The width of the largest scalar or vector register type. unsigned getRegisterBitWidth(bool Vector) const; @@ -1243,7 +1256,9 @@ Type *Ty) = 0; virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty) = 0; - virtual unsigned getNumberOfRegisters(bool Vector) = 0; + virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0; + virtual unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const = 0; + virtual const char* getRegisterClassName(unsigned ClassID) const = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; @@ -1586,8 +1601,14 @@ Type *Ty) override { return Impl.getIntImmCost(IID, Idx, Imm, Ty); } - unsigned getNumberOfRegisters(bool Vector) override { - return Impl.getNumberOfRegisters(Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const override { + return Impl.getNumberOfRegisters(ClassID); + } + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const override { + return Impl.getRegisterClassForType(Vector, Ty); + } + const char* getRegisterClassName(unsigned ClassID) const override { + return Impl.getRegisterClassName(ClassID); } unsigned getRegisterBitWidth(bool Vector) const override { return Impl.getRegisterBitWidth(Vector); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -354,7 +354,20 @@ return TTI::TCC_Free; } - unsigned getNumberOfRegisters(bool Vector) { return 8; } + unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; } + + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const { + return Vector ? 1 : 0; + }; + + const char* getRegisterClassName(unsigned ClassID) const { + switch (ClassID) { + default: + return "Generic::Unknown Register Class"; + case 0: return "Generic::ScalarRC"; + case 1: return "Generic::VectorRC"; + } + } unsigned getRegisterBitWidth(bool Vector) const { return 32; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -519,8 +519,6 @@ /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; } - unsigned getRegisterBitWidth(bool Vector) const { return 32; } /// Estimate the overhead of scalarizing an instruction. Insert and Extract diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -466,8 +466,16 @@ return Cost; } -unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { - return TTIImpl->getNumberOfRegisters(Vector); +unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const { + return TTIImpl->getNumberOfRegisters(ClassID); +} + +unsigned TargetTransformInfo::getRegisterClassForType(bool Vector, Type *Ty) const { + return TTIImpl->getRegisterClassForType(Vector, Ty); +} + +const char* TargetTransformInfo::getRegisterClassName(unsigned ClassID) const { + return TTIImpl->getRegisterClassName(ClassID); } unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -85,7 +85,8 @@ bool enableInterleavedAccessVectorization() { return true; } - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 32; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -122,7 +122,8 @@ /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 16; diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -72,7 +72,13 @@ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); - unsigned getNumberOfRegisters(bool Vector); + + enum PPCRegisterClass { + GPRRC, FPRRC, VRRC, VSXRC + }; + unsigned getNumberOfRegisters(unsigned ClassID) const; + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; + const char* getRegisterClassName(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getCacheLineSize(); unsigned getPrefetchDistance(); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -594,10 +594,37 @@ return true; } -unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { - if (Vector && !ST->hasAltivec() && !ST->hasQPX()) - return 0; - return ST->hasVSX() ? 64 : 32; +unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + assert(ClassID == GPRRC || ClassID == FPRRC || + ClassID == VRRC || ClassID == VSXRC); + if (ST->hasVSX()) { + assert(ClassID == GPRRC || ClassID == VSXRC); + return ClassID == GPRRC ? 32 : 64; + } + assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC); + return 32; +} + +unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { + if (Vector) + return ST->hasVSX() ? VSXRC : VRRC; + else if (Ty && Ty->getScalarType()->isFloatTy()) + return ST->hasVSX() ? VSXRC : FPRRC; + else + return GPRRC; +} + +const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { + + switch (ClassID) { + default: + llvm_unreachable("unknown register class"); + return "PPC::unknown register class"; + case GPRRC: return "PPC::GPRRC"; + case FPRRC: return "PPC::FPRRC"; + case VRRC: return "PPC::VRRC"; + case VSXRC: return "PPC::VSXRC"; + } } unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,7 +56,7 @@ /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getCacheLineSize() { return 256; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -304,7 +304,8 @@ C2.ScaleCost, C2.SetupCost); } -unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) { +unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (!Vector) // Discount the stack pointer. Also leave out %r0, since it can't // be used in an address. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -53,7 +53,7 @@ /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -25,10 +25,11 @@ return TargetTransformInfo::PSK_FastHardware; } -unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) { - unsigned Result = BaseT::getNumberOfRegisters(Vector); +unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + unsigned Result = BaseT::getNumberOfRegisters(ClassID); // For SIMD, use at least 16 registers, as a rough guess. + bool Vector = (ClassID == 1); if (Vector) Result = std::max(Result, 16u); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -116,7 +116,7 @@ /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(unsigned VF); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -116,7 +116,8 @@ llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } -unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { +unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector && !ST->hasSSE1()) return 0; diff --git a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h --- a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h +++ b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h @@ -40,7 +40,8 @@ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { return 0; } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1386,7 +1386,9 @@ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as // additional instruction (at least fill). - unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1; + // TODO: Need distinguish register class? + unsigned TTIRegNum = TTI->getNumberOfRegisters( + TTI->getRegisterClassForType(false, F.getType())) - 1; if (C.NumRegs > TTIRegNum) { // Cost already exceeded TTIRegNum, then only newly added register can add // new instructions. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -983,10 +983,11 @@ /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - + /// The key is ClassID of target-provided register class. + SmallMapVector LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; + /// The key is ClassID of target-provided register class. + SmallMapVector MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -4962,9 +4963,14 @@ // Select the largest VF which doesn't require more registers than existing // ones. - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + bool Selected = true; + for (auto& pair : RUs[i].MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + if (pair.second > TargetNumRegisters) + Selected = false; + } + if (Selected) { MaxVF = VFs[i]; break; } @@ -5115,22 +5121,12 @@ if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); - - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + for (auto& pair : R.MaxLocalUsers) { + pair.second = std::max(pair.second, 1U); + } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -5143,13 +5139,35 @@ // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / - R.MaxLocalUsers); + unsigned IC = UINT_MAX; - // Don't count the induction variable as interleaved. - if (EnableIndVarRegisterHeur) - IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / - std::max(1U, (R.MaxLocalUsers - 1))); + for (auto& pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers of " + << TTI.getRegisterClassName(pair.first) << " register class\n"); + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + unsigned MaxLocalUsers = pair.second; + unsigned LoopInvariantRegs = 0; + if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + + unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) { + TmpIC = + PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); + } + + IC = std::min(IC, TmpIC); + } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5331,7 +5349,7 @@ const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector RUs(VFs.size()); - SmallVector MaxUsages(VFs.size(), 0); + SmallVector, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5361,21 +5379,45 @@ // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live intervals. + SmallMapVector RegUsage; + if (VFs[j] == 1) { - MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); - continue; + for (auto Inst : OpenIntervals) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } + } else { + collectUniformsAndScalars(VFs[j]); + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + continue; + if (isScalarAfterVectorization(Inst, VFs[j])) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } else { + unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); + else + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + } + } } - collectUniformsAndScalars(VFs[j]); - // Count the number of live intervals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || - isScalarAfterVectorization(Inst, VFs[j])) - continue; - RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + + for (auto& pair : RegUsage) { + if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) + MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); + else + MaxUsages[j][pair.first] = pair.second; } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5386,18 +5428,32 @@ } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - if (VFs[i] == 1) - Invariant = LoopInvariants.size(); - else { - for (auto Inst : LoopInvariants) - Invariant += GetRegUsage(Inst->getType(), VFs[i]); + SmallMapVector Invariant; + + for (auto Inst : LoopInvariants) { + unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + if (Invariant.find(ClassID) == Invariant.end()) + Invariant[ClassID] = Usage; + else + Invariant[ClassID] += Usage; } LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant - << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " + << MaxUsages[i].size() << " item\n"); + for (const auto& pair : MaxUsages[i]) { + LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) + << ", " << pair.second << " registers \n"); + } + LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " + << Invariant.size() << " item\n"); + for (const auto& pair : Invariant) { + LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) + << ", " << pair.second << " registers \n"); + } RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -7762,7 +7818,8 @@ // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. - if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5237,7 +5237,7 @@ // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(true)) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) return false; // Don't vectorize when the attribute NoImplicitFloat is used. diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll copy from llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll copy to llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -1,19 +1,38 @@ -; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s -; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F -; REQUIRES: asserts +; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 +; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9 @a = global [1024 x i8] zeroinitializer, align 16 @b = global [1024 x i8] zeroinitializer, align 16 define i32 @foo() { -; This function has a loop of SAD pattern. Here we check when VF = 16 the -; register usage doesn't exceed 16. ; ; CHECK-LABEL: foo + ; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 7 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 7 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item ; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 13 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item + +; CHECK-PWR8: LV(REG): VF = 16 +; CHECK-PWR8-NEXT: LV(REG): Found max usage: 2 item +; CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers +; CHECK-PWR8-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-PWR8: Setting best plan to VF=16, UF=4 + +; CHECK-PWR9: LV(REG): VF = 8 +; CHECK-PWR9-NEXT: LV(REG): Found max usage: 2 item +; CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 7 registers +; CHECK-PWR9-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-PWR9: Setting best plan to VF=8, UF=8 + entry: br label %for.body @@ -47,9 +66,23 @@ ; available vector register number. ; CHECK-LABEL: goo ; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 7 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 7 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item ; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 13 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item + +; CHECK: Setting best plan to VF=16, UF=4 + entry: br label %for.body @@ -80,9 +113,14 @@ define i64 @bar(i64* nocapture %a) { ; CHECK-LABEL: bar -; CHECK: LV(REG): VF = 2 -; CHECK: LV(REG): Found max usage: 3 -; +; CHECK: LV(REG): VF = 2 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 3 registers +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item + +; CHECK: Setting best plan to VF=2, UF=12 + entry: br label %for.body @@ -108,13 +146,18 @@ @c = external global [0 x i32], align 4 define void @hoo(i32 %n) { -; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive but its index %tmp can -; be gathered into a vector. For VF == 16, the vector version of %tmp will be <16 x i64> -; so the max usage of AVX512 vector register will be 2. -; AVX512F-LABEL: bar -; AVX512F: LV(REG): VF = 16 -; AVX512F: LV(REG): Found max usage: 2 -; +; CHECK-LABEL: hoo +; CHECK: LV(REG): VF = 4 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 2 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK: LV(REG): VF = 1 +; CHECK-NEXT: LV(REG): Found max usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK: Setting best plan to VF=1, UF=12 + entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll @@ -22,7 +22,11 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Checking a loop in "test_g" -; CHECK: LV(REG): Found max usage: 2 +; CHECK: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers define i32 @test_g(i32* nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 { entry: @@ -60,7 +64,11 @@ } ; CHECK: LV: Checking a loop in "test" -; CHECK: LV(REG): Found max usage: 2 +; CHECK: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers define i32 @test(i32* nocapture readonly %a, i32 %n) local_unnamed_addr { entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll @@ -11,9 +11,15 @@ ; ; CHECK-LABEL: foo ; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 7 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item ; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 13 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item entry: br label %for.body @@ -47,9 +53,15 @@ ; available vector register number. ; CHECK-LABEL: goo ; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 7 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item ; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 13 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item entry: br label %for.body @@ -81,8 +93,11 @@ define i64 @bar(i64* nocapture %a) { ; CHECK-LABEL: bar ; CHECK: LV(REG): VF = 2 -; CHECK: LV(REG): Found max usage: 3 -; +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 3 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item + entry: br label %for.body @@ -113,8 +128,11 @@ ; so the max usage of AVX512 vector register will be 2. ; AVX512F-LABEL: bar ; AVX512F: LV(REG): VF = 16 -; AVX512F: LV(REG): Found max usage: 2 -; +; AVX512F-CHECK: LV(REG): Found max usage: 2 item +; AVX512F-CHECK: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; AVX512F-CHECK: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; AVX512F-CHECK: LV(REG): Found invariant usage: 0 item + entry: br label %for.body