Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -125,6 +125,11 @@ "trip count that is smaller than this " "value.")); +static cl::opt MaximizeBandwidth( + "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, + cl::desc("Maximize bandwidth when selecting vectorization factor which " + "will be determined by the smallest type in loop.")); + /// This enables versioning on the strides of symbolically striding memory /// accesses in code like the following. /// for (i = 0; i < N; ++i) @@ -1363,10 +1368,10 @@ /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the smallest and widest types in the code + /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - unsigned getWidestType(); + std::pair getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1393,8 +1398,10 @@ unsigned NumInstructions; }; - /// \return information about the register usage of the loop. - RegisterUsage calculateRegisterUsage(); + /// \return information about the register usages of the loop for the given + /// vectorization factors. + SmallVector + calculateRegisterUsage(const SmallVector &VFs); private: /// Returns the expected execution cost. The unit of the cost does @@ -4562,7 +4569,8 @@ unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4570,19 +4578,41 @@ WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + + unsigned VF = MaxVectorSize; + if (MaximizeBandwidth && !OptForSize) { + // Collect all viable vectorization factors. + SmallVector VFs; + unsigned NewMaxVectorSize = WidestRegister / SmallestType; + for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + VFs.push_back(VS); + + // For each VF calculate its register usage. + auto RUs = calculateRegisterUsage(VFs); + + // Select the largest VF which doen't require more registers than existing + // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); + for (int i = RUs.size() - 1; i >= 0; --i) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + VF = VFs[i]; + break; + } + } + } + + DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " + << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); - if (MaxVectorSize == 0) { + if (VF == 0) { DEBUG(dbgs() << "LV: The target has no vector registers.\n"); - MaxVectorSize = 1; + VF = 1; } - assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" - " into one vector!"); - - unsigned VF = MaxVectorSize; + assert(VF <= 64 && + "Did not expect to pack so many elements into one vector!"); // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { @@ -4658,7 +4688,9 @@ return Factor; } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair +LoopVectorizationCostModel::getSmallestAndWidestTypes() { + unsigned MinWidth = 1024; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4698,12 +4730,14 @@ if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) continue; + MinWidth = std::min(MinWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return MaxWidth; + return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4749,7 +4783,7 @@ TargetNumRegisters = ForceTargetNumVectorRegs; } - LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -4859,8 +4893,9 @@ return 1; } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector +LoopVectorizationCostModel::calculateRegisterUsage( + const SmallVector &VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -4881,8 +4916,8 @@ LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage R; - R.NumInstructions = 0; + RegisterUsage RU; + RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the @@ -4901,7 +4936,7 @@ unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - R.NumInstructions += (*bb)->size(); + RU.NumInstructions += (*bb)->size(); for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; ++it) { Instruction *I = it; @@ -4938,14 +4973,25 @@ TransposeEnds[it->second].push_back(it->first); SmallSet OpenIntervals; - unsigned MaxUsage = 0; + // Get the size of the widest register. + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + unsigned WidestRegister = + std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + + SmallVector RUs(VFs.size()); + SmallVector MaxUsages(VFs.size(), 0); DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. - if (!Ends.count(I)) continue; + if (!Ends.count(I)) + continue; // Skip ignored values. if (ValuesToIgnore.count(I)) @@ -4953,27 +4999,46 @@ // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int j=0, e = List.size(); j < e; ++j) + for (unsigned int j = 0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); - // Count the number of live interals. - MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live interals. + unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) { + auto TypeSize = + (unsigned)DL.getTypeSizeInBits(Inst->getType()->getScalarType()); + RegUsage += std::max(1, VFs[j] * TypeSize / WidestRegister); + } + MaxUsages[j] = std::max(MaxUsages[j], RegUsage); + } - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() << '\n'); + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } - unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); + for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + unsigned Invariant = 0; + for (auto Inst : LoopInvariants) { + auto TypeSize = + (unsigned)DL.getTypeSizeInBits(Inst->getType()->getScalarType()); + Invariant += std::max(1, VFs[i] * TypeSize / WidestRegister); + } + + DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); + + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[i]; + RUs[i] = RU; + } - R.LoopInvariantRegs = Invariant; - R.MaxLocalUsers = MaxUsage; - return R; + return RUs; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { Index: test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll @@ -0,0 +1,45 @@ +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global [1000 x i8] zeroinitializer, align 16 +@b = global [1000 x i8] zeroinitializer, align 16 +@c = global [1000 x i8] zeroinitializer, align 16 +@u = global [1000 x i32] zeroinitializer, align 16 +@v = global [1000 x i32] zeroinitializer, align 16 +@w = global [1000 x i32] zeroinitializer, align 16 + +; Tests that the vectorization factor is determined by the smallest instead of +; widest type in the loop for maximum bandwidth when +; -vectorizer-maximize-bandwidth is indicated. +; +; CHECK-label: foo +; CHECK: LV: Selecting VF: 16. +define void @foo() { +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv + store i8 %add, i8* %arrayidx6, align 1 + %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx8, align 4 + %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv + store i32 %add11, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} Index: test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll =================================================================== --- test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll +++ test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -17,7 +17,7 @@ ; widest vector count. ; ; CHECK: test_consecutive_store -; CHECK: The Widest type: 64 bits +; CHECK: The Smallest and Widest types: 64 / 64 bits. define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { %4 = load %0*, %0** %2, align 8 %5 = icmp eq %0** %0, %1 @@ -51,7 +51,7 @@ ; p[i][y] = (int*) (1 + q[i]); ; } ; CHECK: test_nonconsecutive_store -; CHECK: The Widest type: 16 bits +; CHECK: The Smallest and Widest types: 16 / 16 bits. define void @test_nonconsecutive_store() nounwind ssp uwtable { br label %1 @@ -93,7 +93,7 @@ ;; Now we check the same rules for loads. We should take consecutive loads of ;; pointer types into account. ; CHECK: test_consecutive_ptr_load -; CHECK: The Widest type: 64 bits +; CHECK: The Smallest and Widest types: 8 / 64 bits. define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { br label %1 @@ -117,7 +117,7 @@ ;; However, we should not take unconsecutive loads of pointers into account. ; CHECK: test_nonconsecutive_ptr_load -; CHECK: The Widest type: 16 bits +; CHECK: LV: The Smallest and Widest types: 16 / 16 bits. define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { br label %1