Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6004,7 +6004,7 @@ // Saves the list of values that are used in the loop but are defined outside // the loop (not including non-instruction values such as arguments and // constants). - SmallPtrSet LoopInvariants; + SmallPtrSet LoopInvariants; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -6130,11 +6130,16 @@ for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + bool IsScalar = all_of(cast(Inst)->users(), [&](User *U) { + auto *I = dyn_cast(U); + return !I || TheLoop != LI->getLoopFor(I->getParent()) || + isScalarAfterVectorization(I, VFs[i]); + }); + + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); - Invariant[ClassID] += Usage; + TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); + Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ Index: llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: asserts + +; RUN: opt -mtriple arm64-linux -passes=loop-vectorize -mattr=+sve -debug-only=loop-vectorize -disable-output <%s 2>&1 | FileCheck %s + +; Invariant register usage calculation should take into account if the +; invariant would be used in widened instructions. 
Only in such cases, a vector +; register would be required for holding the invariant. For all other cases +; such as below (where usage of %0 in loop doesn't require vector register), a +; general purpose register suffices. +; Check that below test doesn't crash while calculating register usage for +; invariant %0 + +@string = internal unnamed_addr constant [5 x i8] c"abcd\00", align 1 +define i32 @get_invariant_reg_usage(ptr %z) { +; CHECK: LV: Checking a loop in 'get_invariant_reg_usage' +; CHECK: LV(REG): VF = vscale x 1 +; CHECK-NEXT: LV(REG): Found max usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers + +L.entry: + %0 = load i128, ptr %z, align 16 + %1 = icmp slt i128 %0, 1 + %a = getelementptr i8, ptr %z, i64 1 + br i1 %1, label %return, label %loopbody + +loopbody: ;preds = %L.entry, %loopbody + %b = phi ptr [ %2, %loopbody ], [ @string, %L.entry ] + %len_input = phi i128 [ %len, %loopbody ], [ %0, %L.entry ] + %len = add nsw i128 %len_input, -1 + %2 = getelementptr i8, ptr %b, i64 1 + %3 = load i8, ptr %b, align 1 + store i8 %3, ptr %a, align 4 + %.not = icmp eq i128 %len, 0 + br i1 %.not, label %return, label %loopbody + +return: ;preds = %loopbody, %L.entry + ret i32 undef +} Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -175,7 +175,7 @@ ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers ;CHECK-PWR8: LV(REG): Found invariant usage: 1 item -;CHECK-PWR8-NEXT: LV(REG): RegisterClass: 
PPC::GPRRC, 1 registers ;CHECK-PWR9: LV(REG): VF = 1 ;CHECK-PWR9: LV(REG): Found max usage: 2 item Index: llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll @@ -31,22 +31,22 @@ ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL2: LV(REG): Found max usage: 2 item ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers +; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL4: LV(REG): Found max usage: 2 item ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL8: LV(REG): Found max usage: 2 item ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers ; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers +; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers entry: %conv = zext i32 %size to i64 Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll 
=================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -102,7 +102,7 @@ ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop cost is 23 @@ -234,7 +234,7 @@ ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop cost is 23 Index: llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll +++ llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll @@ -26,7 +26,7 @@ ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers define i32 @test_g(ptr nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 { entry: @@ -68,7 +68,7 @@ ; CHECK-NEXT: LV(REG): RegisterClass: 
Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers define i32 @test(ptr nocapture readonly %a, i32 %n) local_unnamed_addr { entry: