Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -126,6 +126,11 @@ "trip count that is smaller than this " "value.")); +static cl::opt<bool> MaximizeBandwidth( + "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, + cl::desc("Maximize bandwidth when selecting vectorization factor which " + "will be determined by the smallest type in loop.")); + /// This enables versioning on the strides of symbolically striding memory /// accesses in code like the following. /// for (i = 0; i < N; ++i) @@ -1408,10 +1413,10 @@ /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the smallest and widest types in the code + /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - unsigned getWidestType(); + std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1438,8 +1443,10 @@ unsigned NumInstructions; }; - /// \return information about the register usage of the loop. - RegisterUsage calculateRegisterUsage(); + /// \return Returns information about the register usages of the loop for the + /// given vectorization factors. + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs); private: /// Returns the expected execution cost. 
The unit of the cost does @@ -4705,7 +4712,8 @@ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); - unsigned WidestType = getWidestType(); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4713,7 +4721,9 @@ WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + + DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " + << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4726,6 +4736,26 @@ " into one vector!"); unsigned VF = MaxVectorSize; + if (MaximizeBandwidth && !OptForSize) { + // Collect all viable vectorization factors. + SmallVector<unsigned, 8> VFs; + unsigned NewMaxVectorSize = WidestRegister / SmallestType; + for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + VFs.push_back(VS); + + // For each VF calculate its register usage. + auto RUs = calculateRegisterUsage(VFs); + + // Select the largest VF which doesn't require more registers than existing + // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); + for (int i = RUs.size() - 1; i >= 0; --i) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + VF = VFs[i]; + break; + } + } + } // If we optimize the program for size, avoid creating the tail loop. 
if (OptForSize) { @@ -4801,7 +4831,9 @@ return Factor; } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair<unsigned, unsigned> +LoopVectorizationCostModel::getSmallestAndWidestTypes() { + unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4841,12 +4873,14 @@ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; + MinWidth = std::min(MinWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return MaxWidth; + return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4892,7 +4926,7 @@ TargetNumRegisters = ForceTargetNumVectorRegs; } - LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -5002,8 +5036,9 @@ return 1; } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> +LoopVectorizationCostModel::calculateRegisterUsage( + const SmallVector<unsigned, 8> &VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5024,8 +5059,8 @@ LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage R; - R.NumInstructions = 0; + RegisterUsage RU; + RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. 
The values // of the map are the index of the 'last seen' usage of the @@ -5044,7 +5079,7 @@ unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - R.NumInstructions += (*bb)->size(); + RU.NumInstructions += (*bb)->size(); for (Instruction &I : **bb) { IdxToInstr[Index++] = &I; @@ -5079,10 +5114,26 @@ TransposeEnds[it->second].push_back(it->first); SmallSet<Instruction *, 8> OpenIntervals; - unsigned MaxUsage = 0; + // Get the size of the widest register. + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + unsigned WidestRegister = + std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + + SmallVector<RegisterUsage, 8> RUs(VFs.size()); + SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + + // A lambda that gets the register usage for the given type and VF. + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); + return std::max<unsigned>(1, VF * TypeSize / WidestRegister); + }; + for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. @@ -5094,27 +5145,50 @@ // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int j=0, e = List.size(); j < e; ++j) + for (unsigned int j = 0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); - // Count the number of live interals. - MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + // For each VF find the maximum usage of registers. + for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + if (VFs[j] == 1) { + MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); + continue; + } + + // Count the number of live intervals. 
+ unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) + RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + MaxUsages[j] = std::max(MaxUsages[j], RegUsage); + } - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() << '\n'); + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } - unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); - - R.LoopInvariantRegs = Invariant; - R.MaxLocalUsers = MaxUsage; - return R; + for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + unsigned Invariant = 0; + if (VFs[i] == 1) + Invariant = LoopInvariants.size(); + else { + for (auto Inst : LoopInvariants) + Invariant += GetRegUsage(Inst->getType(), VFs[i]); + } + + DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); + + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[i]; + RUs[i] = RU; + } + + return RUs; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { Index: llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll @@ -0,0 +1,46 @@ +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 
+target triple = "x86_64-unknown-linux-gnu" + +@a = global [1000 x i8] zeroinitializer, align 16 +@b = global [1000 x i8] zeroinitializer, align 16 +@c = global [1000 x i8] zeroinitializer, align 16 +@u = global [1000 x i32] zeroinitializer, align 16 +@v = global [1000 x i32] zeroinitializer, align 16 +@w = global [1000 x i32] zeroinitializer, align 16 + +; Tests that the vectorization factor is determined by the smallest instead of +; widest type in the loop for maximum bandwidth when +; -vectorizer-maximize-bandwidth is indicated. +; +; CHECK-label: foo +; CHECK: LV: Selecting VF: 16. +define void @foo() { +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv + store i8 %add, i8* %arrayidx6, align 1 + %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx8, align 4 + %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv + store i32 %add11, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} Index: llvm/trunk/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll =================================================================== --- 
llvm/trunk/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -17,7 +17,7 @@ ; widest vector count. ; ; CHECK: test_consecutive_store -; CHECK: The Widest type: 64 bits +; CHECK: The Smallest and Widest types: 64 / 64 bits. define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { %4 = load %0*, %0** %2, align 8 %5 = icmp eq %0** %0, %1 @@ -51,7 +51,7 @@ ; p[i][y] = (int*) (1 + q[i]); ; } ; CHECK: test_nonconsecutive_store -; CHECK: The Widest type: 16 bits +; CHECK: The Smallest and Widest types: 16 / 16 bits. define void @test_nonconsecutive_store() nounwind ssp uwtable { br label %1 @@ -93,7 +93,7 @@ ;; Now we check the same rules for loads. We should take consecutive loads of ;; pointer types into account. ; CHECK: test_consecutive_ptr_load -; CHECK: The Widest type: 64 bits +; CHECK: The Smallest and Widest types: 8 / 64 bits. define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { br label %1 @@ -117,7 +117,7 @@ ;; However, we should not take unconsecutive loads of pointers into account. ; CHECK: test_nonconsecutive_ptr_load -; CHECK: The Widest type: 16 bits +; CHECK: LV: The Smallest and Widest types: 16 / 16 bits. define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { br label %1