diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1762,11 +1762,10 @@ "Reserve the stack by the multiple of one vector size."); MachineRegisterInfo &MRI = MF.getRegInfo(); - const RISCVInstrInfo *TII = MF.getSubtarget().getInstrInfo(); int64_t NumOfVReg = Amount / 8; Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass); - BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL) + BuildMI(MBB, II, DL, get(RISCV::PseudoReadVLENB), VL) .setMIFlag(Flag); assert(isInt<32>(NumOfVReg) && "Expect the number of vector registers within 32-bits."); @@ -1774,29 +1773,44 @@ uint32_t ShiftAmount = Log2_32(NumOfVReg); if (ShiftAmount == 0) return VL; - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL) + BuildMI(MBB, II, DL, get(RISCV::SLLI), VL) .addReg(VL, RegState::Kill) .addImm(ShiftAmount) .setMIFlag(Flag); + } else if ((NumOfVReg == 3 || NumOfVReg == 5 || NumOfVReg == 9) && + STI.hasStdExtZba()) { + // We can use Zba SHXADD instructions for multiply in some cases. + // TODO: Generalize to SHXADD+SLLI. + unsigned Opc; + switch (NumOfVReg) { + default: llvm_unreachable("Unexpected number of vregs"); + case 3: Opc = RISCV::SH1ADD; break; + case 5: Opc = RISCV::SH2ADD; break; + case 9: Opc = RISCV::SH3ADD; break; + } + BuildMI(MBB, II, DL, get(Opc), VL) + .addReg(VL, RegState::Kill) + .addReg(VL) + .setMIFlag(Flag); } else if (isPowerOf2_32(NumOfVReg - 1)) { Register ScaledRegister = MRI.createVirtualRegister(&RISCV::GPRRegClass); uint32_t ShiftAmount = Log2_32(NumOfVReg - 1); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), ScaledRegister) + BuildMI(MBB, II, DL, get(RISCV::SLLI), ScaledRegister) .addReg(VL) .addImm(ShiftAmount) .setMIFlag(Flag); - BuildMI(MBB, II, DL, TII->get(RISCV::ADD), VL) + BuildMI(MBB, II, DL, get(RISCV::ADD), VL) .addReg(ScaledRegister, RegState::Kill) .addReg(VL, RegState::Kill) .setMIFlag(Flag); } else if (isPowerOf2_32(NumOfVReg + 1)) { Register ScaledRegister = MRI.createVirtualRegister(&RISCV::GPRRegClass); uint32_t ShiftAmount = Log2_32(NumOfVReg + 1); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), ScaledRegister) + BuildMI(MBB, II, DL, get(RISCV::SLLI), ScaledRegister) .addReg(VL) .addImm(ShiftAmount) .setMIFlag(Flag); - BuildMI(MBB, II, DL, TII->get(RISCV::SUB), VL) + BuildMI(MBB, II, DL, get(RISCV::SUB), VL) .addReg(ScaledRegister, RegState::Kill) .addReg(VL, RegState::Kill) .setMIFlag(Flag); @@ -1805,16 +1819,16 @@ if (!isInt<12>(NumOfVReg)) movImm(MBB, II, DL, N, NumOfVReg); else { - BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), N) + BuildMI(MBB, II, DL, get(RISCV::ADDI), N) .addReg(RISCV::X0) .addImm(NumOfVReg) .setMIFlag(Flag); } - if (!MF.getSubtarget().hasStdExtM()) + if (!STI.hasStdExtM()) MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{ MF.getFunction(), "M-extension must be enabled to calculate the vscaled size/offset."}); - BuildMI(MBB, II, DL, TII->get(RISCV::MUL), VL) + BuildMI(MBB, II, DL, get(RISCV::MUL), VL) .addReg(VL, RegState::Kill) .addReg(N, RegState::Kill) .setMIFlag(Flag); diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll --- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll +++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s \ -; RUN: | FileCheck %s +; 
RUN: | FileCheck %s --check-prefixes=CHECK,NOZBA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zba -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZBA define void @lmul1() nounwind { ; CHECK-LABEL: lmul1: @@ -69,17 +71,27 @@ } define void @lmul1_and_2() nounwind { -; CHECK-LABEL: lmul1_and_2: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: ret +; NOZBA-LABEL: lmul1_and_2: +; NOZBA: # %bb.0: +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: add sp, sp, a0 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: lmul1_and_2: +; ZBA: # %bb.0: +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: add sp, sp, a0 +; ZBA-NEXT: ret %v1 = alloca %v2 = alloca ret void @@ -108,61 +120,103 @@ } define void @lmul1_and_4() nounwind { -; CHECK-LABEL: lmul1_and_4: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: addi s0, sp, 32 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: andi sp, sp, -32 -; CHECK-NEXT: addi sp, s0, -32 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 -; CHECK-NEXT: ret +; NOZBA-LABEL: lmul1_and_4: +; NOZBA: # %bb.0: +; NOZBA-NEXT: addi sp, sp, -32 +; NOZBA-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; NOZBA-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; NOZBA-NEXT: addi s0, sp, 32 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 2 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: andi sp, sp, -32 +; NOZBA-NEXT: addi sp, s0, -32 +; NOZBA-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; NOZBA-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; NOZBA-NEXT: addi sp, sp, 32 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: lmul1_and_4: +; ZBA: # %bb.0: +; ZBA-NEXT: addi sp, sp, -32 +; ZBA-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; ZBA-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; ZBA-NEXT: addi s0, sp, 32 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh2add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: andi sp, sp, -32 +; ZBA-NEXT: addi sp, s0, -32 +; ZBA-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; ZBA-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; ZBA-NEXT: addi sp, sp, 32 +; ZBA-NEXT: ret %v1 = alloca %v2 = alloca ret void } define void @lmul2_and_1() nounwind { -; CHECK-LABEL: lmul2_and_1: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: ret +; NOZBA-LABEL: lmul2_and_1: +; NOZBA: # %bb.0: +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: add sp, sp, a0 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: 
lmul2_and_1: +; ZBA: # %bb.0: +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: add sp, sp, a0 +; ZBA-NEXT: ret %v1 = alloca %v2 = alloca ret void } define void @lmul4_and_1() nounwind { -; CHECK-LABEL: lmul4_and_1: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: addi s0, sp, 32 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: andi sp, sp, -32 -; CHECK-NEXT: addi sp, s0, -32 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 -; CHECK-NEXT: ret +; NOZBA-LABEL: lmul4_and_1: +; NOZBA: # %bb.0: +; NOZBA-NEXT: addi sp, sp, -32 +; NOZBA-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; NOZBA-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; NOZBA-NEXT: addi s0, sp, 32 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 2 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: andi sp, sp, -32 +; NOZBA-NEXT: addi sp, s0, -32 +; NOZBA-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; NOZBA-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; NOZBA-NEXT: addi sp, sp, 32 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: lmul4_and_1: +; ZBA: # %bb.0: +; ZBA-NEXT: addi sp, sp, -32 +; ZBA-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; ZBA-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; ZBA-NEXT: addi s0, sp, 32 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh2add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: andi sp, sp, -32 +; ZBA-NEXT: addi sp, s0, -32 +; ZBA-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; ZBA-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; ZBA-NEXT: addi sp, sp, 32 +; ZBA-NEXT: ret %v1 = alloca %v2 = alloca ret void @@ -240,21 +294,35 @@ define void @gpr_and_lmul1_and_2() nounwind { -; CHECK-LABEL: gpr_and_lmul1_and_2: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: sd a0, 8(sp) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; NOZBA-LABEL: gpr_and_lmul1_and_2: +; NOZBA: # %bb.0: +; NOZBA-NEXT: addi sp, sp, -16 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: li a0, 3 +; NOZBA-NEXT: sd a0, 8(sp) +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: add sp, sp, a0 +; NOZBA-NEXT: addi sp, sp, 16 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: gpr_and_lmul1_and_2: +; ZBA: # %bb.0: +; ZBA-NEXT: addi sp, sp, -16 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: li a0, 3 +; ZBA-NEXT: sd a0, 8(sp) +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: add sp, sp, a0 +; ZBA-NEXT: addi sp, sp, 16 +; ZBA-NEXT: ret %x1 = alloca i64 %v1 = alloca %v2 = alloca @@ -263,24 +331,42 @@ } define void @gpr_and_lmul1_and_4() nounwind { -; CHECK-LABEL: gpr_and_lmul1_and_4: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: addi s0, sp, 32 -; 
CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: andi sp, sp, -32 -; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: sd a0, 8(sp) -; CHECK-NEXT: addi sp, s0, -32 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 -; CHECK-NEXT: ret +; NOZBA-LABEL: gpr_and_lmul1_and_4: +; NOZBA: # %bb.0: +; NOZBA-NEXT: addi sp, sp, -32 +; NOZBA-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; NOZBA-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; NOZBA-NEXT: addi s0, sp, 32 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: slli a1, a0, 2 +; NOZBA-NEXT: add a0, a1, a0 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: andi sp, sp, -32 +; NOZBA-NEXT: li a0, 3 +; NOZBA-NEXT: sd a0, 8(sp) +; NOZBA-NEXT: addi sp, s0, -32 +; NOZBA-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; NOZBA-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; NOZBA-NEXT: addi sp, sp, 32 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: gpr_and_lmul1_and_4: +; ZBA: # %bb.0: +; ZBA-NEXT: addi sp, sp, -32 +; ZBA-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; ZBA-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; ZBA-NEXT: addi s0, sp, 32 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: sh2add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: andi sp, sp, -32 +; ZBA-NEXT: li a0, 3 +; ZBA-NEXT: sd a0, 8(sp) +; ZBA-NEXT: addi sp, s0, -32 +; ZBA-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; ZBA-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; ZBA-NEXT: addi sp, sp, 32 +; ZBA-NEXT: ret %x1 = alloca i64 %v1 = alloca %v2 = alloca
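
Note on the Zba branch above: `shNadd rd, rs1, rs2` computes `(rs1 << N) + rs2`, so with both source operands set to the same register `x`, `sh1add` yields `3*x`, `sh2add` yields `5*x`, and `sh3add` yields `9*x`. That is why the ZBA check lines collapse the two-instruction `slli`+`add` sequences from the NOZBA output into a single instruction. A minimal standalone sketch of the selection logic, for illustration only (`shNadd` and `scaleVLENB` are names invented for this note, not LLVM APIs):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Emulates Zba's shNadd rd, rs1, rs2: (rs1 << N) + rs2.
static uint64_t shNadd(unsigned N, uint64_t Rs1, uint64_t Rs2) {
  return (Rs1 << N) + Rs2;
}

// Mirrors the opcode selection in the patch: scale VLENB by NumOfVReg
// with a single shNadd when NumOfVReg is 3, 5, or 9.
static uint64_t scaleVLENB(uint64_t VLENB, unsigned NumOfVReg) {
  switch (NumOfVReg) {
  case 3:  return shNadd(1, VLENB, VLENB); // sh1add: 2x + x == 3x
  case 5:  return shNadd(2, VLENB, VLENB); // sh2add: 4x + x == 5x
  case 9:  return shNadd(3, VLENB, VLENB); // sh3add: 8x + x == 9x
  default: return VLENB * NumOfVReg;       // slli, slli+add/sub, or mul paths
  }
}

int main() {
  // Check the shift-add identities against plain multiplication,
  // using 16 as a stand-in VLENB value.
  for (unsigned N : {3u, 5u, 9u})
    assert(scaleVLENB(16, N) == 16u * N);
  puts("shNadd identities hold");
  return 0;
}
```

The in-code TODO ("Generalize to SHXADD+SLLI") points at composite factors: e.g. `6*x` could be formed as `sh1add` (giving `3*x`) followed by an `slli` by 1, still avoiding a `mul` and the M-extension requirement on the fallback path.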