This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/RISCV/
-
Target/
-
RISCV/
-
RISCVRegisterInfo.td
-
test/CodeGen/RISCV/rvv/
-
CodeGen/
-
RISCV/
-
rvv/
-
vsetvli-insert-crossbb.ll
-
vsetvli-insert.ll

Differential D118020

[RISCV] Set CostPerUse for floating point registers
Needs ReviewPublic

Authored by wangpc on Jan 24 2022, 2:00 AM.

Download Raw Diff

Details

Reviewers

jrtc27
craig.topper
asb

Summary

Set CostPerUse to 1 for floating point registers when RVC is
enabled so that more compressed instructions will be generated.

Code size and performance have some improvements.

SPEC FP 2006 (On Allwinner's D1 chip, with XuanTie C906):

                 Code size     Performance
453.povray        -1.145%        +7.926%
433.milc             -           +1.399%
450.soplex        -0.905%        +1.177%
470.lbm              -           +0.188%
444.namd          -1.882%        +0.124%
447.dealII        -0.440%        +0.053%
482.sphinx3          -           -1.569%

For CSiBE, we reduced 1%-5% code size under -Oz.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

• pcwang-thead created this revision.Jan 24 2022, 2:00 AM

Herald added subscribers: VincentWu, luke957, achieveartificialintelligence and 25 others. · View Herald TranscriptJan 24 2022, 2:00 AM

• pcwang-thead requested review of this revision.Jan 24 2022, 2:00 AM

Herald added a project: Restricted Project. · View Herald TranscriptJan 24 2022, 2:00 AM

Herald added subscribers: llvm-commits, eopXD, MaskRay. · View Herald Transcript

Harbormaster completed remote builds in B145182: Diff 402435.Jan 24 2022, 4:12 AM

This is causing multiple compiler-time failures (assertions) for me on the GCC torture suite. e.g. 930608-1.c for rv32imafdc with the ilp32 ABI at O{1,2,3,s}. pr44942.c fails similarly for rv32imafdc with the ilp32d ABI at O{1,2,3,s}. It seems likely it's unmasking a bug elsewhere, though I haven't done any more delving.

clang: /home/asb/llvm-project/llvm/include/llvm/CodeGen/LiveInterval.h:378: llvm::SlotIndex llvm::LiveRange::beginIndex() const: Assertion `!empty() && "Call to beginIndex() on empty range."' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /home/asb/llvm-project/build/default/bin/clang -cc1 -triple riscv32-unknown-linux-gnu -S -save-temps=obj -disable-free -clear-ast-before-backend -main-file-name 930608-1.c -mrelocation-model static -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -target-feature +m -target-feature +a -target-feature +f -target-feature +d -target-feature +c -target-feature +relax -target-feature -save-restore -target-abi ilp32 -msmall-data-limit 8 -mllvm -treat-scalable-fixed-error-as-warning -debugger-tuning=gdb -fcoverage-compilation-dir=/home/asb/torture -resource-dir /home/asb/llvm-project/build/release/lib/clang/14.0.0 -O1 -fdebug-compilation-dir=/home/asb/torture -ferror-limit 19 -fno-signed-char -fgnuc-version=4.2.1 -Qn -faddrsig -o ./output_rv32imafdc_ilp32_O1/930608-1.s 930608-1.c
1.	<eof> parser at end of file
2.	Code generation
3.	Running pass 'Function Pass Manager' on module '930608-1.c'.
4.	Running pass 'Greedy Register Allocator' on function '@f'
 #0 0x00007f8398acc207 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /home/asb/llvm-project/llvm/lib/Support/Unix/Signals.inc:565:11
 #1 0x00007f8398acc41b PrintStackTraceSignalHandler(void*) /home/asb/llvm-project/llvm/lib/Support/Unix/Signals.inc:632:1
 #2 0x00007f8398aca7d7 llvm::sys::RunSignalHandlers() /home/asb/llvm-project/llvm/lib/Support/Signals.cpp:97:5
 #3 0x00007f8398accc31 SignalHandler(int) /home/asb/llvm-project/llvm/lib/Support/Unix/Signals.inc:0:3
 #4 0x00007f83a1c01870 __restore_rt sigaction.c:0:0
 #5 0x00007f839815fd22 raise (/usr/lib/libc.so.6+0x3cd22)
 #6 0x00007f8398149862 abort (/usr/lib/libc.so.6+0x26862)
 #7 0x00007f8398149747 _nl_load_domain.cold loadmsgcat.c:0:0
 #8 0x00007f8398158616 (/usr/lib/libc.so.6+0x35616)
 #9 0x00007f839f29bbf4 llvm::LiveRange::beginIndex() const /home/asb/llvm-project/llvm/include/llvm/CodeGen/LiveInterval.h:0:7
#10 0x00007f839f2aa9ce llvm::LiveIntervals::intervalIsInOneMBB(llvm::LiveInterval const&) const /home/asb/llvm-project/llvm/lib/CodeGen/LiveIntervals.cpp:837:24
#11 0x00007f839f702144 llvm::DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost(llvm::LiveInterval&, llvm::MCRegister, bool, llvm::EvictionCost&, llvm::SmallSet<llvm::Register, 16u, std::less<llvm::Register> > const&) const /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:500:23
#12 0x00007f839f703368 llvm::DefaultEvictionAdvisor::tryFindEvictionCandidate(llvm::LiveInterval&, llvm::AllocationOrder const&, unsigned char, llvm::SmallSet<llvm::Register, 16u, std::less<llvm::Register> > const&) const /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:783:9
#13 0x00007f839f7019a8 llvm::RAGreedy::tryEvict(llvm::LiveInterval&, llvm::AllocationOrder&, llvm::SmallVectorImpl<llvm::Register>&, unsigned char, llvm::SmallSet<llvm::Register, 16u, std::less<llvm::Register> > const&) /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:809:39
#14 0x00007f839f7012c7 llvm::RAGreedy::tryAssign(llvm::LiveInterval&, llvm::AllocationOrder&, llvm::SmallVectorImpl<llvm::Register>&, llvm::SmallSet<llvm::Register, 16u, std::less<llvm::Register> > const&) /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:408:25
#15 0x00007f839f70bd31 llvm::RAGreedy::selectOrSplitImpl(llvm::LiveInterval&, llvm::SmallVectorImpl<llvm::Register>&, llvm::SmallSet<llvm::Register, 16u, std::less<llvm::Register> >&, unsigned int) /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:2629:11
#16 0x00007f839f70c79d llvm::RAGreedy::selectOrSplit(llvm::LiveInterval&, llvm::SmallVectorImpl<llvm::Register>&) /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:2361:20
#17 0x00007f839f70c94c non-virtual thunk to llvm::RAGreedy::selectOrSplit(llvm::LiveInterval&, llvm::SmallVectorImpl<llvm::Register>&) /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:0:0
#18 0x00007f839f6e1ffd llvm::RegAllocBase::allocatePhysRegs() /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocBase.cpp:112:35
#19 0x00007f839f70f7cb llvm::RAGreedy::runOnMachineFunction(llvm::MachineFunction&) /home/asb/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp:2942:3
#20 0x00007f839f42c4cc llvm::MachineFunctionPass::runOnFunction(llvm::Function&) /home/asb/llvm-project/llvm/lib/CodeGen/MachineFunctionPass.cpp:72:8
#21 0x00007f8399c7a6fd llvm::FPPassManager::runOnFunction(llvm::Function&) /home/asb/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1435:23
#22 0x00007f8399c7fcdf llvm::FPPassManager::runOnModule(llvm::Module&) /home/asb/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1481:16
#23 0x00007f8399c7b0aa (anonymous namespace)::MPPassManager::runOnModule(llvm::Module&) /home/asb/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1550:23
#24 0x00007f8399c7abbd llvm::legacy::PassManagerImpl::run(llvm::Module&) /home/asb/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:540:16
#25 0x00007f8399c7ffe1 llvm::legacy::PassManager::run(llvm::Module&) /home/asb/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1677:3
#26 0x00007f83a0076ef5 (anonymous namespace)::EmitAssemblyHelper::RunCodegenPipeline(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >&, std::unique_ptr<llvm::ToolOutputFile, std::default_delete<llvm::ToolOutputFile> >&) /home/asb/llvm-project/clang/lib/CodeGen/BackendUtil.cpp:1535:3
#27 0x00007f83a006ecb4 (anonymous namespace)::EmitAssemblyHelper::EmitAssembly(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) /home/asb/llvm-project/clang/lib/CodeGen/BackendUtil.cpp:1566:7
#28 0x00007f83a006d3f5 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::StringRef, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) /home/asb/llvm-project/clang/lib/CodeGen/BackendUtil.cpp:1727:5
#29 0x00007f83a083b1ef clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) /home/asb/llvm-project/clang/lib/CodeGen/CodeGenAction.cpp:370:7
#30 0x00007f8392b908ef clang::ParseAST(clang::Sema&, bool, bool) /home/asb/llvm-project/clang/lib/Parse/ParseAST.cpp:178:12
#31 0x00007f839d4b1a2c clang::ASTFrontendAction::ExecuteAction() /home/asb/llvm-project/clang/lib/Frontend/FrontendAction.cpp:1076:1
#32 0x00007f83a08342eb clang::CodeGenAction::ExecuteAction() /home/asb/llvm-project/clang/lib/CodeGen/CodeGenAction.cpp:1107:5
#33 0x00007f839d4b1359 clang::FrontendAction::Execute() /home/asb/llvm-project/clang/lib/Frontend/FrontendAction.cpp:971:7
#34 0x00007f839d3b669f clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) /home/asb/llvm-project/clang/lib/Frontend/CompilerInstance.cpp:1030:23
#35 0x00007f83a1bd1c44 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) /home/asb/llvm-project/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp:261:8
#36 0x00005607d246ddcc cc1_main(llvm::ArrayRef<char const*>, char const*, void*) /home/asb/llvm-project/clang/tools/driver/cc1_main.cpp:246:13
#37 0x00005607d246018a ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) /home/asb/llvm-project/clang/tools/driver/driver.cpp:317:5
#38 0x00005607d245f1d9 main /home/asb/llvm-project/clang/tools/driver/driver.cpp:388:5
#39 0x00007f839814ab25 __libc_start_main (/usr/lib/libc.so.6+0x27b25)
#40 0x00005607d245e9be _start (/home/asb/llvm-project/build/default/bin/clang+0x3d9be)
Aborted (core dumped)

In D118020#3265940, @asb wrote:

This is causing multiple compiler-time failures (assertions) for me on the GCC torture suite. e.g. 930608-1.c for rv32imafdc with the ilp32 ABI at O{1,2,3,s}. pr44942.c fails similarly for rv32imafdc with the ilp32d ABI at O{1,2,3,s}. It seems likely it's unmasking a bug elsewhere, though I haven't done any more delving.

That's weird, I will have a look.

Thanks.

• pcwang-thead mentioned this in D118124: [regalloc] Fix assertion error when LiveInterval is empty.Jan 25 2022, 4:17 AM

• pcwang-thead mentioned this in rG859745827802: [regalloc] Fix assertion error when LiveInterval is empty.Jan 25 2022, 10:08 PM

In D118020#3265955, @pcwang-thead wrote:

In D118020#3265940, @asb wrote:

This is causing multiple compiler-time failures (assertions) for me on the GCC torture suite. e.g. 930608-1.c for rv32imafdc with the ilp32 ABI at O{1,2,3,s}. pr44942.c fails similarly for rv32imafdc with the ilp32d ABI at O{1,2,3,s}. It seems likely it's unmasking a bug elsewhere, though I haven't done any more delving.

That's weird, I will have a look.

Thanks.

I may have fixed this bug (these two GCC torture tests are OK at least ) in D118124.

But I have the same question that MatzeB raised: why do we generate such code?

For example, f in 930608-1.c only contains one return instruction with GCC for RISCV or clang for ARM/AArch64 target:

double f (double a) {}
f:
	ret

And below is what we have now:

f: 
	addi	sp, sp, -16
	fsd	fs1, 8(sp)
	lw	a0, 8(sp)
	lw	a1, 12(sp)
	addi	sp, sp, 16
	ret

In the MIR level, we can eliminate IMPLICIT_DEFs after processimpdefs pass:

bb.0.entry:
  %4:fpr64 = IMPLICIT_DEF                                             # Eliminated
  FSD killed %4:fpr64, %stack.0, 0 :: (store (s64) into %stack.0)     # -> FSD killed undef %4:fpr64, %stack.0, 0 :: (store (s64) into %stack.0)
  %2:gpr = LW %stack.0, 0 :: (load (s32) from %stack.0, align 8)
  %3:gpr = LW %stack.0, 4 :: (load (s32) from %stack.0 + 4, basealign 8)
  $x10 = COPY %2:gpr
  $x11 = COPY %3:gpr
  PseudoRET implicit $x10, implicit $x11

But we can't eliminate these dead stores.

I guess there may be some problems in handling BuildPairF64 and SplitF64.

liaolucy added a subscriber: liaolucy.Jan 25 2022, 11:10 PM

In D118020#3271549, @pcwang-thead wrote:
In D118020#3265955, @pcwang-thead wrote:

In D118020#3265940, @asb wrote:

This is causing multiple compiler-time failures (assertions) for me on the GCC torture suite. e.g. 930608-1.c for rv32imafdc with the ilp32 ABI at O{1,2,3,s}. pr44942.c fails similarly for rv32imafdc with the ilp32d ABI at O{1,2,3,s}. It seems likely it's unmasking a bug elsewhere, though I haven't done any more delving.

That's weird, I will have a look.

Thanks.

I may have fixed this bug (these two GCC torture tests are OK at least ) in D118124.

But I have the same question that MatzeB raised: why do we generate such code?

For example, f in 930608-1.c only contains one return instruction with GCC for RISCV or clang for ARM/AArch64 target:
double f (double a) {}
f:
	ret
And below is what we have now:
f: 
	addi	sp, sp, -16
	fsd	fs1, 8(sp)
	lw	a0, 8(sp)
	lw	a1, 12(sp)
	addi	sp, sp, 16
	ret
In the MIR level, we can eliminate IMPLICIT_DEFs after processimpdefs pass:
bb.0.entry:
  %4:fpr64 = IMPLICIT_DEF                                             # Eliminated
  FSD killed %4:fpr64, %stack.0, 0 :: (store (s64) into %stack.0)     # -> FSD killed undef %4:fpr64, %stack.0, 0 :: (store (s64) into %stack.0)
  %2:gpr = LW %stack.0, 0 :: (load (s32) from %stack.0, align 8)
  %3:gpr = LW %stack.0, 4 :: (load (s32) from %stack.0 + 4, basealign 8)
  $x10 = COPY %2:gpr
  $x11 = COPY %3:gpr
  PseudoRET implicit $x10, implicit $x11
But we can't eliminate these dead stores.

I guess there may be some problems in handling BuildPairF64 and SplitF64.

We probably need a DAG combine to turn SplitF64 undef into a pair of integer undefs.

Ping.

Rebase.

Harbormaster completed remote builds in B148892: Diff 407747.Feb 10 2022, 8:33 PM

I'm surprised this resulted in performance increases. I might have guessed that with so few FP instructions being compressible, the further constraint on register selection might be more likely to result in a (slight) decrease in performance. Shows the value of running the benchmarks!

I've put this patch on the agenda for the RISC-V LLVM call today, but based on the data so far this seems to make sense.

In D118020#3329779, @asb wrote:

I'm surprised this resulted in performance increases. I might have guessed that with so few FP instructions being compressible, the further constraint on register selection might be more likely to result in a (slight) decrease in performance. Shows the value of running the benchmarks!

I've put this patch on the agenda for the RISC-V LLVM call today, but based on the data so far this seems to make sense.

I am surprised too.

IMO, there is a possible reason that may explain the performance increases:

When a register number is in [8, 15], instructions using it can be compressed.
For the first 16 integer registers, registers x0-x4 (and sometimes x5) are reserved for special usage, and the register allocation order is as follows:

def GPR : RegisterClass<"RISCV", [XLenVT], 32, (add
    (sequence "X%u", 10, 17),
    (sequence "X%u", 5, 7),
    (sequence "X%u", 28, 31),
    (sequence "X%u", 8, 9),
    (sequence "X%u", 18, 27),
    (sequence "X%u", 0, 4)
  )> {
  let RegInfos = XLenRI;
}

which means we will allocate most RVC integer registers first.
So, for most programs, there is minimal difference whether we set CostPerUse to 0 or 1.

For the first 16 float registers, there is no reserved register, and the register allocation orders are like below:

def FPR32 : RegisterClass<"RISCV", [f32], 32, (add
    (sequence "F%u_F", 0, 7),
    (sequence "F%u_F", 10, 17),
    (sequence "F%u_F", 28, 31),
    (sequence "F%u_F", 8, 9),
    (sequence "F%u_F", 18, 27)
)>;

which means we will allocate temporary float registers first and most float instructions can't be compressed.
So when we set CostPerUse to 1, a lot of float instructions can be compressed, which results in improvements on icache misses.

In D118020#3331582, @pcwang-thead wrote:
In D118020#3329779, @asb wrote:

I'm surprised this resulted in performance increases. I might have guessed that with so few FP instructions being compressible, the further constraint on register selection might be more likely to result in a (slight) decrease in performance. Shows the value of running the benchmarks!

I've put this patch on the agenda for the RISC-V LLVM call today, but based on the data so far this seems to make sense.

I am surprised too.

IMO, there is a possible reason that may explain the performance increases:

When a register number is in [8, 15], instructions using it can be compressed.

For the first 16 integer registers, registers x0-x4 (and sometimes x5) are reserved for special usage, and the register allocation order is as follows:
def GPR : RegisterClass<"RISCV", [XLenVT], 32, (add
    (sequence "X%u", 10, 17),
    (sequence "X%u", 5, 7),
    (sequence "X%u", 28, 31),
    (sequence "X%u", 8, 9),
    (sequence "X%u", 18, 27),
    (sequence "X%u", 0, 4)
  )> {
  let RegInfos = XLenRI;
}
which means we will allocate most RVC integer registers first.
So, for most programs, there is minimal difference whether we set CostPerUse to 0 or 1.

For the first 16 float registers, there is no reserved register, and the register allocation orders are like below:
def FPR32 : RegisterClass<"RISCV", [f32], 32, (add
    (sequence "F%u_F", 0, 7),
    (sequence "F%u_F", 10, 17),
    (sequence "F%u_F", 28, 31),
    (sequence "F%u_F", 8, 9),
    (sequence "F%u_F", 18, 27)
)>;
which means we will allocate temporary float registers first and most float instructions can't be compressed.
So when we set CostPerUse to 1, a lot of float instructions can be compressed, which results in improvements on icache misses.

Our confusion is that there are only 8 float opcodes that have compressed forms. They are all loads/stores and 4 of them are limited to RV32.

I ran 453.povray from SPEC2006 on a SiFive Unmatched board with this patch applied to our downstream compiler. My result was a 1% decrease in performance.

Should we look at what happens if we change the allocation order to use compressible argument registers first without changing the cost?

Our confusion is that there are only 8 float opcodes that have compressed forms. They are all loads/stores and 4 of them are limited to RV32.

I ran 453.povray from SPEC2006 on a SiFive Unmatched board with this patch applied to our downstream compiler. My result was a 1% decrease in performance.

I ran the whole SPEC2006 FP for several times and got the same performance increases(about 1% under geometric mean).
The reason why we got different results may be that there are some differences between the micro-architectures and the downstream compilers (I am not sure). I will run it again with pure upstream later.

Should we look at what happens if we change the allocation order to use compressible argument registers first without changing the cost?

Yes I think it makes sense. I will have a try.
@craig.topper

Results of running SPEC2006 FP with upstream Clang/LLVM (based on 8ad6d5e465bba198c883e699c28690b0ea79400d) and options are -march=rv64imafdc -mabi=lp64d without micro-arch specific scheduling.
All benchmarks ran 10 times and we compared the arithmetic average run times.

Set CostPerUse to 1:

Benchmark	Performance
447.dealII	+2.087%
453.povray	+1.354%
450.soplex	+0.8854%
482.sphinx3	+0.8112%
433.milc	+0.743%
470.lbm	+0.1381%
444.namd	-0.714%

Geometric mean: +0.7544%

Allocate argument floating-point register first:

Benchmark	Performance
453.povray	+1.866%
450.soplex	+0.6809%
447.dealII	+0.1095%
433.milc	-0.01403%
444.namd	-0.1019%
470.lbm	-0.4855%
482.sphinx3	-0.6987%

Geometric mean: +0.1906%

Both 1 and 2:

Benchmark	Performance
482.sphinx3	+1.006%
433.milc	+0.9639%
450.soplex	+0.88%
453.povray	+0.6788%
447.dealII	+0.04175%
470.lbm	-0.118%
444.namd	-0.4662%

Geometric mean: +0.425%

All of them have code size reductions.

@craig.topper @asb

In D118020#3344580, @pcwang-thead wrote:

Results of running SPEC2006 FP with upstream Clang/LLVM (based on 8ad6d5e465bba198c883e699c28690b0ea79400d) and options are -march=rv64imafdc -mabi=lp64d without micro-arch specific scheduling.
All benchmarks ran 10 times and we compared the arithmetic average run times.

Set CostPerUse to 1:

Benchmark Performance

447.dealII +2.087%

453.povray +1.354%

450.soplex +0.8854%

482.sphinx3 +0.8112%

433.milc +0.743%

470.lbm +0.1381%

444.namd -0.714%

Geometric mean: +0.7544%

Allocate argument floating-point register first:

Benchmark Performance

453.povray +1.866%

450.soplex +0.6809%

447.dealII +0.1095%

433.milc -0.01403%

444.namd -0.1019%

470.lbm -0.4855%

482.sphinx3 -0.6987%

Geometric mean: +0.1906%

Both 1 and 2:

Benchmark Performance

482.sphinx3 +1.006%

433.milc +0.9639%

450.soplex +0.88%

453.povray +0.6788%

447.dealII +0.04175%

470.lbm -0.118%

444.namd -0.4662%

Geometric mean: +0.425%

All of them have code size reductions.

@craig.topper @asb

What optimization level was this? Was fast-math enabled?

Herald added a project: Restricted Project. · View Herald TranscriptMar 4 2022, 6:42 PM

In D118020#3361467, @craig.topper wrote:

What optimization level was this? Was fast-math enabled?

O2 and -ffast-math -fno-unroll-loops.

We disabled unroll-loops because it may remain some problems under cover.

@pcwang-thead: could you please post the version that just changes the preferred allocation order for review? We had a brief discussion in the RISC-V LLVM call and think that if that change is a positive improvement for you it may be simpler to just land that, initially at least. Did you have code size measurements for that too?

Herald added subscribers: • s, arichardson. · View Herald TranscriptMar 17 2022, 9:23 AM

In D118020#3389550, @asb wrote:

@pcwang-thead: could you please post the version that just changes the preferred allocation order for review? We had a brief discussion in the RISC-V LLVM call and think that if that change is a positive improvement for you it may be simpler to just land that, initially at least. Did you have code size measurements for that too?

Thanks! I have uploaded the patch (see D122209).

Code size improves mildly under O2 and Oz:
O2:

Benchmark	Code size
450.soplex	-0.064%
444.namd	-0.093%
447.dealII	-0.159%
453.povray	-0.206%

Oz:

Benchmark	Code size
483.xalancbmk	-0.033%
464.h264ref	-0.085%
447.dealII	-0.119%
453.povray	-0.14%

Herald added a subscriber: StephenFan. · View Herald TranscriptMar 21 2022, 11:01 PM

• pcwang-thead mentioned this in D146488: [RISCV] Move compressible registers to the beginning of the FP allocation order..Mar 20 2023, 8:22 PM

• pcwang-thead added a subscriber: wangpc.Jun 12 2023, 1:14 AM

Herald added subscribers: jobnoorman, luke, shiva0217. · View Herald TranscriptJun 12 2023, 1:14 AM

evandro removed a subscriber: evandro.Jun 12 2023, 2:32 PM

wangpc commandeered this revision.Jul 13 2023, 7:46 PM

wangpc added a reviewer: • pcwang-thead.

wangpc removed a reviewer: • pcwang-thead.

Ping. Can we make it the same as integer registers after D146488?

Rebase.
The performance and code size measurements are out of date.

Harbormaster completed remote builds in B247110: Diff 542774.Jul 21 2023, 12:56 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

RISCV/

RISCVRegisterInfo.td

27 lines

test/

CodeGen/

RISCV/

rvv/

vsetvli-insert-crossbb.ll

4 lines

vsetvli-insert.ll

12 lines

Diff 402435

llvm/lib/Target/RISCV/RISCVRegisterInfo.td

	Show First 20 Lines • Show All 172 Lines • ▼ Show 20 Lines
	}			}

	def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> {			def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> {
	let RegInfos = XLenRI;			let RegInfos = XLenRI;
	}			}

	// Floating point registers			// Floating point registers
	let RegAltNameIndices = [ABIRegAltName] in {			let RegAltNameIndices = [ABIRegAltName] in {
				let CostPerUse = [0, 1] in {
	def F0_H : RISCVReg16<0, "f0", ["ft0"]>, DwarfRegNum<[32]>;			def F0_H : RISCVReg16<0, "f0", ["ft0"]>, DwarfRegNum<[32]>;
	def F1_H : RISCVReg16<1, "f1", ["ft1"]>, DwarfRegNum<[33]>;			def F1_H : RISCVReg16<1, "f1", ["ft1"]>, DwarfRegNum<[33]>;
	def F2_H : RISCVReg16<2, "f2", ["ft2"]>, DwarfRegNum<[34]>;			def F2_H : RISCVReg16<2, "f2", ["ft2"]>, DwarfRegNum<[34]>;
	def F3_H : RISCVReg16<3, "f3", ["ft3"]>, DwarfRegNum<[35]>;			def F3_H : RISCVReg16<3, "f3", ["ft3"]>, DwarfRegNum<[35]>;
	def F4_H : RISCVReg16<4, "f4", ["ft4"]>, DwarfRegNum<[36]>;			def F4_H : RISCVReg16<4, "f4", ["ft4"]>, DwarfRegNum<[36]>;
	def F5_H : RISCVReg16<5, "f5", ["ft5"]>, DwarfRegNum<[37]>;			def F5_H : RISCVReg16<5, "f5", ["ft5"]>, DwarfRegNum<[37]>;
	def F6_H : RISCVReg16<6, "f6", ["ft6"]>, DwarfRegNum<[38]>;			def F6_H : RISCVReg16<6, "f6", ["ft6"]>, DwarfRegNum<[38]>;
	def F7_H : RISCVReg16<7, "f7", ["ft7"]>, DwarfRegNum<[39]>;			def F7_H : RISCVReg16<7, "f7", ["ft7"]>, DwarfRegNum<[39]>;
				}
	def F8_H : RISCVReg16<8, "f8", ["fs0"]>, DwarfRegNum<[40]>;			def F8_H : RISCVReg16<8, "f8", ["fs0"]>, DwarfRegNum<[40]>;
	def F9_H : RISCVReg16<9, "f9", ["fs1"]>, DwarfRegNum<[41]>;			def F9_H : RISCVReg16<9, "f9", ["fs1"]>, DwarfRegNum<[41]>;
	def F10_H : RISCVReg16<10,"f10", ["fa0"]>, DwarfRegNum<[42]>;			def F10_H : RISCVReg16<10,"f10", ["fa0"]>, DwarfRegNum<[42]>;
	def F11_H : RISCVReg16<11,"f11", ["fa1"]>, DwarfRegNum<[43]>;			def F11_H : RISCVReg16<11,"f11", ["fa1"]>, DwarfRegNum<[43]>;
	def F12_H : RISCVReg16<12,"f12", ["fa2"]>, DwarfRegNum<[44]>;			def F12_H : RISCVReg16<12,"f12", ["fa2"]>, DwarfRegNum<[44]>;
	def F13_H : RISCVReg16<13,"f13", ["fa3"]>, DwarfRegNum<[45]>;			def F13_H : RISCVReg16<13,"f13", ["fa3"]>, DwarfRegNum<[45]>;
	def F14_H : RISCVReg16<14,"f14", ["fa4"]>, DwarfRegNum<[46]>;			def F14_H : RISCVReg16<14,"f14", ["fa4"]>, DwarfRegNum<[46]>;
	def F15_H : RISCVReg16<15,"f15", ["fa5"]>, DwarfRegNum<[47]>;			def F15_H : RISCVReg16<15,"f15", ["fa5"]>, DwarfRegNum<[47]>;
				let CostPerUse = [0, 1] in {
	def F16_H : RISCVReg16<16,"f16", ["fa6"]>, DwarfRegNum<[48]>;			def F16_H : RISCVReg16<16,"f16", ["fa6"]>, DwarfRegNum<[48]>;
	def F17_H : RISCVReg16<17,"f17", ["fa7"]>, DwarfRegNum<[49]>;			def F17_H : RISCVReg16<17,"f17", ["fa7"]>, DwarfRegNum<[49]>;
	def F18_H : RISCVReg16<18,"f18", ["fs2"]>, DwarfRegNum<[50]>;			def F18_H : RISCVReg16<18,"f18", ["fs2"]>, DwarfRegNum<[50]>;
	def F19_H : RISCVReg16<19,"f19", ["fs3"]>, DwarfRegNum<[51]>;			def F19_H : RISCVReg16<19,"f19", ["fs3"]>, DwarfRegNum<[51]>;
	def F20_H : RISCVReg16<20,"f20", ["fs4"]>, DwarfRegNum<[52]>;			def F20_H : RISCVReg16<20,"f20", ["fs4"]>, DwarfRegNum<[52]>;
	def F21_H : RISCVReg16<21,"f21", ["fs5"]>, DwarfRegNum<[53]>;			def F21_H : RISCVReg16<21,"f21", ["fs5"]>, DwarfRegNum<[53]>;
	def F22_H : RISCVReg16<22,"f22", ["fs6"]>, DwarfRegNum<[54]>;			def F22_H : RISCVReg16<22,"f22", ["fs6"]>, DwarfRegNum<[54]>;
	def F23_H : RISCVReg16<23,"f23", ["fs7"]>, DwarfRegNum<[55]>;			def F23_H : RISCVReg16<23,"f23", ["fs7"]>, DwarfRegNum<[55]>;
	def F24_H : RISCVReg16<24,"f24", ["fs8"]>, DwarfRegNum<[56]>;			def F24_H : RISCVReg16<24,"f24", ["fs8"]>, DwarfRegNum<[56]>;
	def F25_H : RISCVReg16<25,"f25", ["fs9"]>, DwarfRegNum<[57]>;			def F25_H : RISCVReg16<25,"f25", ["fs9"]>, DwarfRegNum<[57]>;
	def F26_H : RISCVReg16<26,"f26", ["fs10"]>, DwarfRegNum<[58]>;			def F26_H : RISCVReg16<26,"f26", ["fs10"]>, DwarfRegNum<[58]>;
	def F27_H : RISCVReg16<27,"f27", ["fs11"]>, DwarfRegNum<[59]>;			def F27_H : RISCVReg16<27,"f27", ["fs11"]>, DwarfRegNum<[59]>;
	def F28_H : RISCVReg16<28,"f28", ["ft8"]>, DwarfRegNum<[60]>;			def F28_H : RISCVReg16<28,"f28", ["ft8"]>, DwarfRegNum<[60]>;
	def F29_H : RISCVReg16<29,"f29", ["ft9"]>, DwarfRegNum<[61]>;			def F29_H : RISCVReg16<29,"f29", ["ft9"]>, DwarfRegNum<[61]>;
	def F30_H : RISCVReg16<30,"f30", ["ft10"]>, DwarfRegNum<[62]>;			def F30_H : RISCVReg16<30,"f30", ["ft10"]>, DwarfRegNum<[62]>;
	def F31_H : RISCVReg16<31,"f31", ["ft11"]>, DwarfRegNum<[63]>;			def F31_H : RISCVReg16<31,"f31", ["ft11"]>, DwarfRegNum<[63]>;
				}

	foreach Index = 0-31 in {			foreach Index = 0-7 in {
				let CostPerUse = [0, 1] in {
	def F#Index#_F : RISCVReg32<!cast<RISCVReg16>("F"#Index#"_H")>,			def F#Index#_F : RISCVReg32<!cast<RISCVReg16>("F"#Index#"_H")>,
	DwarfRegNum<[!add(Index, 32)]>;			DwarfRegNum<[!add(Index, 32)]>;
				def F#Index#_D : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_F")>,
				DwarfRegNum<[!add(Index, 32)]>;
				}
	}			}

	foreach Index = 0-31 in {			foreach Index = 8-15 in {
				def F#Index#_F : RISCVReg32<!cast<RISCVReg16>("F"#Index#"_H")>,
				DwarfRegNum<[!add(Index, 32)]>;
				def F#Index#_D : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_F")>,
				DwarfRegNum<[!add(Index, 32)]>;
				}

				foreach Index = 16-31 in {
				let CostPerUse = [0, 1] in {
				def F#Index#_F : RISCVReg32<!cast<RISCVReg16>("F"#Index#"_H")>,
				DwarfRegNum<[!add(Index, 32)]>;
	def F#Index#_D : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_F")>,			def F#Index#_D : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_F")>,
	DwarfRegNum<[!add(Index, 32)]>;			DwarfRegNum<[!add(Index, 32)]>;
	}			}
	}			}
				}

	// The order of registers represents the preferred allocation sequence,			// The order of registers represents the preferred allocation sequence,
	// meaning caller-save regs are listed before callee-save.			// meaning caller-save regs are listed before callee-save.
	def FPR16 : RegisterClass<"RISCV", [f16], 16, (add			def FPR16 : RegisterClass<"RISCV", [f16], 16, (add
	(sequence "F%u_H", 0, 7),			(sequence "F%u_H", 0, 7),
	(sequence "F%u_H", 10, 17),			(sequence "F%u_H", 10, 17),
	(sequence "F%u_H", 28, 31),			(sequence "F%u_H", 28, 31),
	(sequence "F%u_H", 8, 9),			(sequence "F%u_H", 8, 9),
	▲ Show 20 Lines • Show All 337 Lines • Show Last 20 Lines

llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll

	Show First 20 Lines • Show All 442 Lines • ▼ Show 20 Lines
	}			}

	define void @saxpy_vec(i64 %n, float %a, float* nocapture readonly %x, float* nocapture %y) {			define void @saxpy_vec(i64 %n, float %a, float* nocapture readonly %x, float* nocapture %y) {
	; CHECK-LABEL: saxpy_vec:			; CHECK-LABEL: saxpy_vec:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vsetvli a4, a0, e32, m8, ta, mu			; CHECK-NEXT: vsetvli a4, a0, e32, m8, ta, mu
	; CHECK-NEXT: beqz a4, .LBB8_3			; CHECK-NEXT: beqz a4, .LBB8_3
	; CHECK-NEXT: # %bb.1: # %for.body.preheader			; CHECK-NEXT: # %bb.1: # %for.body.preheader
	; CHECK-NEXT: fmv.w.x ft0, a1			; CHECK-NEXT: fmv.w.x fs1, a1
	; CHECK-NEXT: .LBB8_2: # %for.body			; CHECK-NEXT: .LBB8_2: # %for.body
	; CHECK-NEXT: # =>This Inner Loop Header: Depth=1			; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: vle32.v v8, (a2)			; CHECK-NEXT: vle32.v v8, (a2)
	; CHECK-NEXT: vle32.v v16, (a3)			; CHECK-NEXT: vle32.v v16, (a3)
	; CHECK-NEXT: slli a1, a4, 2			; CHECK-NEXT: slli a1, a4, 2
	; CHECK-NEXT: add a2, a2, a1			; CHECK-NEXT: add a2, a2, a1
	; CHECK-NEXT: vsetvli zero, a4, e32, m8, tu, mu			; CHECK-NEXT: vsetvli zero, a4, e32, m8, tu, mu
	; CHECK-NEXT: vfmacc.vf v16, ft0, v8			; CHECK-NEXT: vfmacc.vf v16, fs1, v8
	; CHECK-NEXT: vse32.v v16, (a3)			; CHECK-NEXT: vse32.v v16, (a3)
	; CHECK-NEXT: sub a0, a0, a4			; CHECK-NEXT: sub a0, a0, a4
	; CHECK-NEXT: vsetvli a4, a0, e32, m8, ta, mu			; CHECK-NEXT: vsetvli a4, a0, e32, m8, ta, mu
	; CHECK-NEXT: add a3, a3, a1			; CHECK-NEXT: add a3, a3, a1
	; CHECK-NEXT: bnez a4, .LBB8_2			; CHECK-NEXT: bnez a4, .LBB8_2
	; CHECK-NEXT: .LBB8_3: # %for.end			; CHECK-NEXT: .LBB8_3: # %for.end
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines

llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll

Show First 20 Lines • Show All 187 Lines • ▼ Show 20 Lines	%x = call <vscale x 1 x i64> @llvm.riscv.vadd.mask.nxv1i64.nxv1i64(
i64 0)		i64 0)
%y = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %x, i64 %b, i64 2)		%y = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %x, i64 %b, i64 2)
ret <vscale x 1 x i64> %y		ret <vscale x 1 x i64> %y
}		}

define <vscale x 1 x double> @test10(<vscale x 1 x double> %a, double %b) nounwind {		define <vscale x 1 x double> @test10(<vscale x 1 x double> %a, double %b) nounwind {
; CHECK-LABEL: test10:		; CHECK-LABEL: test10:
; CHECK: # %bb.0: # %entry		; CHECK: # %bb.0: # %entry
; CHECK-NEXT: fmv.d.x ft0, a0		; CHECK-NEXT: fmv.d.x fs1, a0
; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu		; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu
; CHECK-NEXT: vfmv.s.f v8, ft0		; CHECK-NEXT: vfmv.s.f v8, fs1
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%x = tail call i64 @llvm.riscv.vsetvlimax(i64 3, i64 0)		%x = tail call i64 @llvm.riscv.vsetvlimax(i64 3, i64 0)
%y = call <vscale x 1 x double> @llvm.riscv.vfmv.s.f.nxv1f64(		%y = call <vscale x 1 x double> @llvm.riscv.vfmv.s.f.nxv1f64(
<vscale x 1 x double> %a, double %b, i64 1)		<vscale x 1 x double> %a, double %b, i64 1)
ret <vscale x 1 x double> %y		ret <vscale x 1 x double> %y
}		}

define <vscale x 1 x double> @test11(<vscale x 1 x double> %a, double %b) nounwind {		define <vscale x 1 x double> @test11(<vscale x 1 x double> %a, double %b) nounwind {
; CHECK-LABEL: test11:		; CHECK-LABEL: test11:
; CHECK: # %bb.0: # %entry		; CHECK: # %bb.0: # %entry
; CHECK-NEXT: fmv.d.x ft0, a0		; CHECK-NEXT: fmv.d.x fs1, a0
; CHECK-NEXT: vsetivli a0, 6, e64, m1, tu, mu		; CHECK-NEXT: vsetivli a0, 6, e64, m1, tu, mu
; CHECK-NEXT: vfmv.s.f v8, ft0		; CHECK-NEXT: vfmv.s.f v8, fs1
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%x = tail call i64 @llvm.riscv.vsetvli(i64 6, i64 3, i64 0)		%x = tail call i64 @llvm.riscv.vsetvli(i64 6, i64 3, i64 0)
%y = call <vscale x 1 x double> @llvm.riscv.vfmv.s.f.nxv1f64(		%y = call <vscale x 1 x double> @llvm.riscv.vfmv.s.f.nxv1f64(
<vscale x 1 x double> %a, double %b, i64 2)		<vscale x 1 x double> %a, double %b, i64 2)
ret <vscale x 1 x double> %y		ret <vscale x 1 x double> %y
}		}

define <vscale x 1 x double> @test12(<vscale x 1 x double> %a, double %b, <vscale x 1 x i1> %mask) nounwind {		define <vscale x 1 x double> @test12(<vscale x 1 x double> %a, double %b, <vscale x 1 x i1> %mask) nounwind {
; CHECK-LABEL: test12:		; CHECK-LABEL: test12:
; CHECK: # %bb.0: # %entry		; CHECK: # %bb.0: # %entry
; CHECK-NEXT: fmv.d.x ft0, a0		; CHECK-NEXT: fmv.d.x fs1, a0
; CHECK-NEXT: vsetivli zero, 9, e64, m1, tu, mu		; CHECK-NEXT: vsetivli zero, 9, e64, m1, tu, mu
; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t		; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t
; CHECK-NEXT: vfmv.s.f v8, ft0		; CHECK-NEXT: vfmv.s.f v8, fs1
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%x = call <vscale x 1 x double> @llvm.riscv.vfadd.mask.nxv1f64.f64(		%x = call <vscale x 1 x double> @llvm.riscv.vfadd.mask.nxv1f64.f64(
<vscale x 1 x double> %a,		<vscale x 1 x double> %a,
<vscale x 1 x double> %a,		<vscale x 1 x double> %a,
<vscale x 1 x double> %a,		<vscale x 1 x double> %a,
<vscale x 1 x i1> %mask,		<vscale x 1 x i1> %mask,
i64 9,		i64 9,
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines