Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -794,10 +794,20 @@
   /// Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };

+  /// \return True if floating-point registers overlap with integer registers.
+  /// On most targets, scalar floating-point registers do not overlap with
+  /// integer registers, but vector floating-point values do overlap because
+  /// float and integer vectors normally reside in the same vector registers.
+  /// For now we assume the target has as many floating-point registers as
+  /// integer registers, so any overlap is full rather than partial.
+  bool isFloatOverlapIntRegs(bool Vector) const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
-  unsigned getNumberOfRegisters(bool Vector) const;
+  /// If 'IsFloatTy' is true, it returns the number of floating-point
+  /// registers, which may differ from the number of integer registers.
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy = false) const;

   /// \return The width of the largest scalar or vector register type.
   unsigned getRegisterBitWidth(bool Vector) const;
@@ -1251,7 +1261,8 @@
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
-  virtual unsigned getNumberOfRegisters(bool Vector) = 0;
+  virtual unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy = false) = 0;
+  virtual bool isFloatOverlapIntRegs(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
@@ -1596,8 +1607,11 @@
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
-  unsigned getNumberOfRegisters(bool Vector) override {
-    return Impl.getNumberOfRegisters(Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy = false) override {
+    return Impl.getNumberOfRegisters(Vector, IsFloatTy);
+  }
+  bool isFloatOverlapIntRegs(bool Vector) override {
+    return Impl.isFloatOverlapIntRegs(Vector);
   }
   unsigned getRegisterBitWidth(bool Vector) const override {
     return Impl.getRegisterBitWidth(Vector);
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -358,7 +358,11 @@
     return TTI::TCC_Free;
   }

-  unsigned getNumberOfRegisters(bool Vector) { return 8; }
+  bool isFloatOverlapIntRegs(bool Vector) {
+    return Vector;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) { return 8; }

   unsigned getRegisterBitWidth(bool Vector) const { return 32; }

Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -519,7 +519,9 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
+    return Vector ? 0 : 1;
+  }

   unsigned getRegisterBitWidth(bool Vector) const { return 32; }

Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -480,8 +480,13 @@
   return Cost;
 }

-unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
-  return TTIImpl->getNumberOfRegisters(Vector);
+bool TargetTransformInfo::isFloatOverlapIntRegs(bool Vector) const {
+  return TTIImpl->isFloatOverlapIntRegs(Vector);
+}
+
+unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector,
+                                                   bool IsFloatTy) const {
+  return TTIImpl->getNumberOfRegisters(Vector, IsFloatTy);
 }

 unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -85,7 +85,7 @@

   bool enableInterleavedAccessVectorization() { return true; }

-  unsigned getNumberOfRegisters(bool Vector) {
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
     if (Vector) {
       if (ST->hasNEON())
         return 32;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -136,7 +136,7 @@
   }

   unsigned getHardwareNumberOfRegisters(bool Vector) const;
-  unsigned getNumberOfRegisters(bool Vector) const;
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -231,7 +231,7 @@
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
   unsigned getHardwareNumberOfRegisters(bool Vec) const;
-  unsigned getNumberOfRegisters(bool Vec) const;
+  unsigned getNumberOfRegisters(bool Vec, bool IsFloatTy) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -218,7 +218,7 @@
   return 256;
 }

-unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec, bool IsFloatTy) const {
   // This is really the number of registers to fill when vectorizing /
   // interleaving loops, so we lie to avoid trying to use all registers.
   return getHardwareNumberOfRegisters(Vec) >> 3;
@@ -682,7 +682,7 @@
   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
 }

-unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned R600TTIImpl::getNumberOfRegisters(bool Vec, bool IsFloatTy) const {
   return getHardwareNumberOfRegisters(Vec);
 }

Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -122,7 +122,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector) {
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
     if (Vector) {
       if (ST->hasNEON())
         return 16;
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -76,7 +76,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool vector) const;
+  unsigned getNumberOfRegisters(bool vector, bool IsFloatTy) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -91,7 +91,7 @@

 /// --- Vector TTI begin ---

-unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
+unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) const {
   if (Vector)
     return useHVX() ? 32 : 0;
   return 32;
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -66,7 +66,7 @@
   // vectorizers but disables heuristics based on the number of registers.
   // FIXME: Return a more reasonable number, while keeping an eye on
   // LoopVectorizer's unrolling heuristics.
-  unsigned getNumberOfRegisters(bool Vector) const { return 1; }
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) const { return 1; }

   // Only <2 x half> should be vectorized, so always return 32 for the vector
   // register size.
Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -72,7 +72,7 @@
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getCacheLineSize();
   unsigned getPrefetchDistance();
Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -594,10 +594,14 @@
   return true;
 }

-unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
   if (Vector && !ST->hasAltivec() && !ST->hasQPX())
     return 0;
-  return ST->hasVSX() ? 64 : 32;
+  if (Vector)
+    return ST->hasVSX() ? 64 : 32;
+
+  // For scalar types, only float registers benefit from VSX.
+  return ST->hasVSX() && IsFloatTy ? 64 : 32;
 }

 unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -56,7 +56,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;

   unsigned getCacheLineSize() { return 256; }
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -304,7 +304,7 @@
                   C2.ScaleCost, C2.SetupCost);
 }

-unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
   if (!Vector)
     // Discount the stack pointer.  Also leave out %r0, since it can't
     // be used in an address.
Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -53,7 +53,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -25,8 +25,8 @@
   return TargetTransformInfo::PSK_FastHardware;
 }

-unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
-  unsigned Result = BaseT::getNumberOfRegisters(Vector);
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
+  unsigned Result = BaseT::getNumberOfRegisters(Vector, IsFloatTy);

   // For SIMD, use at least 16 registers, as a rough guess.
   if (Vector)
Index: llvm/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -115,7 +115,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -118,7 +118,7 @@
   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
 }

-unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned X86TTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
   if (Vector && !ST->hasSSE1())
     return 0;

Index: llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -40,7 +40,7 @@
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
         TLI(ST->getTargetLowering()) {}

-  unsigned getNumberOfRegisters(bool Vector) {
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
     if (Vector) {
       return 0;
     }
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -204,12 +204,18 @@
 /// number.
 static const unsigned TinyTripCountInterleaveThreshold = 128;

-static cl::opt<unsigned> ForceTargetNumScalarRegs(
-    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+static cl::opt<unsigned> ForceTargetNumIntScalarRegs(
+    "force-target-num-int-scalar-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of scalar registers."));
+static cl::opt<unsigned> ForceTargetNumFloatScalarRegs(
+    "force-target-num-float-scalar-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of scalar registers."));

-static cl::opt<unsigned> ForceTargetNumVectorRegs(
-    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+static cl::opt<unsigned> ForceTargetNumIntVectorRegs(
+    "force-target-num-int-vector-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of vector registers."));
+static cl::opt<unsigned> ForceTargetNumFloatVectorRegs(
+    "force-target-num-float-vector-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of vector registers."));

 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
@@ -982,11 +988,14 @@
   /// A struct that represents some properties of the register usage
   /// of a loop.
   struct RegisterUsage {
-    /// Holds the number of loop invariant values that are used in the loop.
-    unsigned LoopInvariantRegs;
-
-    /// Holds the maximum number of concurrent live intervals in the loop.
-    unsigned MaxLocalUsers;
+    /// Holds the number of integer loop-invariant values used in the loop.
+    unsigned IntLoopInvariantRegs;
+    /// Holds the maximum number of concurrent integer live intervals.
+    unsigned IntMaxLocalUsers;
+    /// Holds the number of float loop-invariant values used in the loop.
+    unsigned FloatLoopInvariantRegs;
+    /// Holds the maximum number of concurrent float live intervals.
+    unsigned FloatMaxLocalUsers;
   };

   /// \return Returns information about the register usages of the loop for the
@@ -4928,11 +4937,28 @@

   // Select the largest VF which doesn't require more registers than existing
   // ones.
-  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
+  unsigned TargetIntNumRegisters = TTI.getNumberOfRegisters(true);
+  unsigned TargetFloatNumRegisters = TTI.getNumberOfRegisters(true, true);
+  bool Overlap = TTI.isFloatOverlapIntRegs(true);
+
   for (int i = RUs.size() - 1; i >= 0; --i) {
-    if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
-      MaxVF = VFs[i];
-      break;
+    if (Overlap) {
+      // We assume the target has the same number of float and integer
+      // registers when Overlap is true, so the overlap is full and we can
+      // use either count as the target register number.
+      unsigned TargetNumRegisters = TargetFloatNumRegisters;
+
+      if (RUs[i].IntMaxLocalUsers + RUs[i].FloatMaxLocalUsers <=
+          TargetNumRegisters) {
+        MaxVF = VFs[i];
+        break;
+      }
+    } else {
+      if (RUs[i].IntMaxLocalUsers <= TargetIntNumRegisters &&
+          RUs[i].FloatMaxLocalUsers <= TargetFloatNumRegisters) {
+        MaxVF = VFs[i];
+        break;
+      }
     }
   }
   if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
@@ -5081,22 +5107,29 @@
   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
     return 1;

-  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
-  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
-                    << " registers\n");
+  unsigned TargetIntNumRegisters = TTI.getNumberOfRegisters(VF > 1);
+  unsigned TargetFloatNumRegisters = TTI.getNumberOfRegisters(VF > 1, true);
+  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetIntNumRegisters
+                    << " integer registers, " << TargetFloatNumRegisters
+                    << " float registers." << '\n');

   if (VF == 1) {
-    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
-      TargetNumRegisters = ForceTargetNumScalarRegs;
+    if (ForceTargetNumIntScalarRegs.getNumOccurrences() > 0)
+      TargetIntNumRegisters = ForceTargetNumIntScalarRegs;
+    if (ForceTargetNumFloatScalarRegs.getNumOccurrences() > 0)
+      TargetFloatNumRegisters = ForceTargetNumFloatScalarRegs;
   } else {
-    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
-      TargetNumRegisters = ForceTargetNumVectorRegs;
+    if (ForceTargetNumIntVectorRegs.getNumOccurrences() > 0)
+      TargetIntNumRegisters = ForceTargetNumIntVectorRegs;
+    if (ForceTargetNumFloatVectorRegs.getNumOccurrences() > 0)
+      TargetFloatNumRegisters = ForceTargetNumFloatVectorRegs;
   }

   RegisterUsage R = calculateRegisterUsage({VF})[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
-  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+  R.IntMaxLocalUsers = std::max(R.IntMaxLocalUsers, 1U);
+  R.FloatMaxLocalUsers = std::max(R.FloatMaxLocalUsers, 1U);

   // We calculate the interleave count using the following formula.
   // Subtract the number of loop invariants from the number of available
@@ -5109,13 +5142,43 @@
   // We also want power of two interleave counts to ensure that the induction
   // variable of the vector loop wraps to zero, when tail is folded by masking;
   // this currently happens when OptForSize, in which case IC is set to 1 above.
-  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
-                              R.MaxLocalUsers);
+  unsigned IC = 1;
+
+  bool Overlap = TTI.isFloatOverlapIntRegs(VF > 1);
+
+  if (!Overlap) {
+    unsigned IntIC = PowerOf2Floor(
+        (TargetIntNumRegisters - R.IntLoopInvariantRegs) / R.IntMaxLocalUsers);
+    unsigned FloatIC =
+        PowerOf2Floor((TargetFloatNumRegisters - R.FloatLoopInvariantRegs) /
+                      R.FloatMaxLocalUsers);
+    // Don't count the induction variable as interleaved.
+    if (EnableIndVarRegisterHeur) {
+      IntIC =
+          PowerOf2Floor((TargetIntNumRegisters - R.IntLoopInvariantRegs - 1) /
+                        std::max(1U, (R.IntMaxLocalUsers - 1)));
+      FloatIC = PowerOf2Floor(
+          (TargetFloatNumRegisters - R.FloatLoopInvariantRegs - 1) /
+          std::max(1U, (R.FloatMaxLocalUsers - 1)));
+    }

-  // Don't count the induction variable as interleaved.
-  if (EnableIndVarRegisterHeur)
-    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
-                       std::max(1U, (R.MaxLocalUsers - 1)));
+    IC = std::min(IntIC, FloatIC);
+
+  } else {
+    // We assume the target has the same number of float and integer
+    // registers when Overlap is true, so the overlap is full and we can
+    // use either count as the target register number.
+    unsigned TargetNumRegisters = TargetFloatNumRegisters;
+    unsigned LoopInvariantRegs =
+        R.IntLoopInvariantRegs + R.FloatLoopInvariantRegs;
+    unsigned MaxLocalUsers = R.IntMaxLocalUsers + R.FloatMaxLocalUsers;
+
+    IC =
+        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+    if (EnableIndVarRegisterHeur)
+      IC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+                         std::max(1U, (MaxLocalUsers - 1)));
+  }

   // Clamp the interleave ranges to reasonable counts.
   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
@@ -5297,7 +5360,8 @@
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();

   SmallVector<RegisterUsage, 8> RUs(VFs.size());
-  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> IntMaxUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> FloatMaxUsages(VFs.size(), 0);

   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

@@ -5327,21 +5391,33 @@

     // For each VF find the maximum usage of registers.
     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
-      if (VFs[j] == 1) {
-        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
-        continue;
-      }
-      collectUniformsAndScalars(VFs[j]);
       // Count the number of live intervals.
-      unsigned RegUsage = 0;
-      for (auto Inst : OpenIntervals) {
-        // Skip ignored values for VF > 1.
-        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
-            isScalarAfterVectorization(Inst, VFs[j]))
-          continue;
-        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+      unsigned IntRegUsage = 0;
+      unsigned FloatRegUsage = 0;
+
+      if (VFs[j] == 1) {
+        for (auto Inst : OpenIntervals) {
+          if (Inst->getType()->isFloatTy())
+            FloatRegUsage += 1;
+          else
+            IntRegUsage += 1;
+        }
+      } else {
+        collectUniformsAndScalars(VFs[j]);
+        for (auto Inst : OpenIntervals) {
+          // Skip ignored values for VF > 1.
+          if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
+              isScalarAfterVectorization(Inst, VFs[j]))
+            continue;
+          if (Inst->getType()->isFloatTy())
+            FloatRegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+          else
+            IntRegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+        }
       }
-      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+
+      IntMaxUsages[j] = std::max(IntMaxUsages[j], IntRegUsage);
+      FloatMaxUsages[j] = std::max(FloatMaxUsages[j], FloatRegUsage);
     }

     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
@@ -5352,21 +5428,27 @@

   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
-    unsigned Invariant = 0;
-    if (VFs[i] == 1)
-      Invariant = LoopInvariants.size();
-    else {
-      for (auto Inst : LoopInvariants)
-        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+    unsigned IntInvariant = 0;
+    unsigned FloatInvariant = 0;
+
+    for (auto Inst : LoopInvariants) {
+      unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+      if (Inst->getType()->isFloatTy())
+        FloatInvariant += Usage;
+      else
+        IntInvariant += Usage;
     }

     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
-    LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
-    LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
-                      << '\n');
-
-    RU.LoopInvariantRegs = Invariant;
-    RU.MaxLocalUsers = MaxUsages[i];
+    LLVM_DEBUG(dbgs() << "LV(REG): Found max int usage: " << IntMaxUsages[i]
+                      << ", max float usage: " << FloatMaxUsages[i] << '\n');
+    LLVM_DEBUG(dbgs() << "LV(REG): Found int invariant usage: " << IntInvariant
+                      << ", float invariant usage: " << FloatInvariant << '\n');
+
+    RU.IntLoopInvariantRegs = IntInvariant;
+    RU.FloatLoopInvariantRegs = FloatInvariant;
+    RU.IntMaxLocalUsers = IntMaxUsages[i];
+    RU.FloatMaxLocalUsers = FloatMaxUsages[i];
     RUs[i] = RU;
   }
Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
@@ -0,0 +1,160 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @foo() {
+;
+; CHECK-LABEL: foo
+
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max int usage: 7, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK-PWR8: LV(REG): VF = 16
+; CHECK-PWR8-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-PWR8-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK-PWR8: Setting best plan to VF=16, UF=4
+
+; CHECK-PWR9: LV(REG): VF = 8
+; CHECK-PWR9-NEXT: LV(REG): Found max int usage: 7, max float usage: 0
+; CHECK-PWR9-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK-PWR9: Setting best plan to VF=8, UF=8
+
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %2 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %2, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @goo() {
+; For indvars.iv used in a computing chain only feeding into getelementptr or cmp,
+; it will not have a vector version, so the vector register usage will not exceed
+; the available vector register number.
+; CHECK-LABEL: goo
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max int usage: 7, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK: Setting best plan to VF=16, UF=4
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %tmp1 = add nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1
+  %tmp = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %tmp to i32
+  %tmp2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2
+  %tmp3 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %tmp3 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %tmp4 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %tmp4, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @bar(i64* nocapture %a) {
+; CHECK-LABEL: bar
+; CHECK: LV(REG): VF = 2
+; CHECK-NEXT: LV(REG): Found max int usage: 3, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK: Setting best plan to VF=2, UF=12
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add2.lcssa = phi i64 [ %add2, %for.body ]
+  ret i64 %add2.lcssa
+
+for.body:
+  %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.012
+  store i64 %add, i64* %arrayidx, align 8
+  %add2 = add nsw i64 %add, %s.011
+  %inc = add nuw nsw i64 %i.012, 1
+  %exitcond = icmp eq i64 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+@d = external global [0 x i64], align 8
+@e = external global [0 x i32], align 4
+@c = external global [0 x i32], align 4
+
+define void @hoo(i32 %n) {
+; CHECK-LABEL: hoo
+; CHECK: LV(REG): VF = 4
+; CHECK-NEXT: LV(REG): Found max int usage: 2, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 1
+; CHECK-NEXT: LV(REG): Found max int usage: 2, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK: Setting best plan to VF=1, UF=12
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv
+  store i32 %tmp1, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
+++ llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
@@ -22,7 +22,7 @@
 target triple = "x86_64-unknown-linux-gnu"

 ; CHECK: LV: Checking a loop in "test_g"
-; CHECK: LV(REG): Found max usage: 2
+; CHECK: LV(REG): Found max int usage: 2

 define i32 @test_g(i32* nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 {
 entry:
@@ -60,7 +60,7 @@
 }

 ; CHECK: LV: Checking a loop in "test"
-; CHECK: LV(REG): Found max usage: 2
+; CHECK: LV(REG): Found max int usage: 2

 define i32 @test(i32* nocapture readonly %a, i32 %n) local_unnamed_addr {
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
+++ llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -11,9 +11,9 @@
 ;
 ; CHECK-LABEL: foo
 ; CHECK: LV(REG): VF = 8
-; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK-NEXT: LV(REG): Found max int usage: 7
 ; CHECK: LV(REG): VF = 16
-; CHECK-NEXT: LV(REG): Found max usage: 13
+; CHECK-NEXT: LV(REG): Found max int usage: 13

 entry:
   br label %for.body
@@ -47,9 +47,9 @@
 ; available vector register number.
 ; CHECK-LABEL: goo
 ; CHECK: LV(REG): VF = 8
-; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK-NEXT: LV(REG): Found max int usage: 7
 ; CHECK: LV(REG): VF = 16
-; CHECK-NEXT: LV(REG): Found max usage: 13
+; CHECK-NEXT: LV(REG): Found max int usage: 13

 entry:
   br label %for.body
@@ -81,7 +81,7 @@
 define i64 @bar(i64* nocapture %a) {
 ; CHECK-LABEL: bar
 ; CHECK: LV(REG): VF = 2
-; CHECK: LV(REG): Found max usage: 3
+; CHECK: LV(REG): Found max int usage: 3
 ;
 entry:
   br label %for.body
@@ -110,10 +110,10 @@
 define void @hoo(i32 %n) {
 ; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive but its index %tmp can
 ; be gathered into a vector. For VF == 16, the vector version of %tmp will be <16 x i64>
-; so the max usage of AVX512 vector register will be 2.
+; so the max int usage of AVX512 vector register will be 2.
 ; AVX512F-LABEL: bar
 ; AVX512F: LV(REG): VF = 16
-; AVX512F: LV(REG): Found max usage: 2
+; AVX512F: LV(REG): Found max int usage: 2
 ;
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/unroll_novec.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/unroll_novec.ll
+++ llvm/test/Transforms/LoopVectorize/unroll_novec.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-interleave=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-int-scalar-regs=16 -force-target-max-scalar-interleave=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
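Note for reviewers: below is a minimal, self-contained sketch (not part of the patch) of the interleave-count heuristic that the LoopVectorize.cpp changes implement. Every name in it (`pickIC`, `powerOf2Floor`, the parameters) is illustrative only; the patch itself uses `TTI.getNumberOfRegisters`, `TTI.isFloatOverlapIntRegs`, and `llvm::PowerOf2Floor`, and it additionally subtracts one live value for the induction variable when `EnableIndVarRegisterHeur` is set, which this sketch omits.

```cpp
#include <algorithm>

// Round down to the nearest power of two (stand-in for llvm::PowerOf2Floor).
static unsigned powerOf2Floor(unsigned V) {
  if (V == 0)
    return 0;
  unsigned P = 1;
  while (P <= V / 2)
    P *= 2;
  return P;
}

// Illustrative only: derive an interleave count from separate integer and
// float register pressure. If float registers fully overlap integer
// registers (typical for a shared vector register file), the two pressures
// share one budget and are summed; otherwise each register class is bounded
// independently and the smaller interleave count wins.
static unsigned pickIC(unsigned IntRegs, unsigned FloatRegs, bool Overlap,
                       unsigned IntInvariant, unsigned IntMaxUsers,
                       unsigned FloatInvariant, unsigned FloatMaxUsers) {
  // Assume at least one register is used in each class, as the patch does.
  IntMaxUsers = std::max(IntMaxUsers, 1u);
  FloatMaxUsers = std::max(FloatMaxUsers, 1u);

  if (Overlap) {
    // Fully overlapping register file: either count serves as the budget.
    unsigned Budget = FloatRegs;
    unsigned Invariant = IntInvariant + FloatInvariant;
    unsigned MaxUsers = IntMaxUsers + FloatMaxUsers;
    return powerOf2Floor((Budget - Invariant) / MaxUsers);
  }

  unsigned IntIC = powerOf2Floor((IntRegs - IntInvariant) / IntMaxUsers);
  unsigned FloatIC = powerOf2Floor((FloatRegs - FloatInvariant) / FloatMaxUsers);
  return std::min(IntIC, FloatIC);
}
```

As a worked example under those assumptions: a loop with 3 integer and 2 float live values, no invariants, and 32 overlapping vector registers gets `powerOf2Floor(32 / 5) = 4`, while the same loop on a target with separate 32-entry integer and float files gets `min(powerOf2Floor(32 / 3), powerOf2Floor(32 / 2)) = 8`.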