diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -28,6 +28,7 @@
   RISCVGatherScatterLowering.cpp
   RISCVInsertVSETVLI.cpp
   RISCVInsertReadWriteCSR.cpp
+  RISCVInsertWriteVXRM.cpp
   RISCVInstrInfo.cpp
   RISCVISelDAGToDAG.cpp
   RISCVISelLowering.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -63,6 +63,9 @@
 FunctionPass *createRISCVInsertReadWriteCSRPass();
 void initializeRISCVInsertReadWriteCSRPass(PassRegistry &);
 
+FunctionPass *createRISCVInsertWriteVXRMPass();
+void initializeRISCVInsertWriteVXRMPass(PassRegistry &);
+
 FunctionPass *createRISCVRedundantCopyEliminationPass();
 void initializeRISCVRedundantCopyEliminationPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/RISCV/RISCVInsertReadWriteCSR.cpp b/llvm/lib/Target/RISCV/RISCVInsertReadWriteCSR.cpp
--- a/llvm/lib/Target/RISCV/RISCVInsertReadWriteCSR.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertReadWriteCSR.cpp
@@ -9,7 +9,6 @@
 // of the RISC-V instructions.
 //
 // Currently the pass implements:
-// -Naive insertion of a write to vxrm before an RVV fixed-point instruction.
 // -Writing and saving frm before an RVV floating-point instruction with a
 // static rounding mode and restores the value after.
 //
@@ -58,25 +57,11 @@
 INITIALIZE_PASS(RISCVInsertReadWriteCSR, DEBUG_TYPE,
                 RISCV_INSERT_READ_WRITE_CSR_NAME, false, false)
 
-// This function inserts a write to vxrm when encountering an RVV fixed-point
-// instruction. This function also swaps frm and restores it when encountering
-// an RVV floating point instruction with a static rounding mode.
+// This function swaps frm and restores it when encountering an RVV
+// floating-point instruction with a static rounding mode.
 bool RISCVInsertReadWriteCSR::emitWriteRoundingMode(MachineBasicBlock &MBB) {
   bool Changed = false;
   for (MachineInstr &MI : MBB) {
-    int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc());
-    if (VXRMIdx >= 0) {
-      unsigned VXRMImm = MI.getOperand(VXRMIdx).getImm();
-
-      Changed = true;
-
-      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(RISCV::WriteVXRMImm))
-          .addImm(VXRMImm);
-      MI.addOperand(MachineOperand::CreateReg(RISCV::VXRM, /*IsDef*/ false,
-                                              /*IsImp*/ true));
-      continue;
-    }
-
     int FRMIdx = RISCVII::getFRMOpNum(MI.getDesc());
     if (FRMIdx < 0)
       continue;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
@@ -0,0 +1,339 @@
+//===-- RISCVInsertWriteVXRM.cpp - Insert Write of RISC-V VXRM CSR --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function pass that inserts a WriteVXRMImm before
+// each RVV pseudo that has a static rounding-mode operand.
+//
+// To cope with intrinsics that have side effects, it checks whether there are
+// any explicit VXRM uses in the given MachineFunction. If there are, it runs
+// emitWriteVXRMLocal instead, which always saves the incoming VCSR value and
+// restores it for inline assembly and VXRM users.
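+//
+// For example, two vaadd.vv pseudos that both use rounding mode 0 in the
+// same block end up sharing a single VXRM write (see test1 in the new
+// llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll test below):
+//
+//   csrwi vxrm, 0
+//   vaadd.vv v8, v8, v9
+//   vaadd.vv v8, v8, v10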
+//
+// For functions without dynamic mode, the pass consists of 3 phases:
+//
+// Phase 1 collects the static rounding-mode changes in each basic block.
+// Phase 2 propagates the rounding-mode state to successor blocks.
+// Between Phases 2 and 3, a partial redundancy elimination to hoist VXRM
+// writes out of simple loops is planned but not implemented yet (see the
+// FIXME in the tests).
+// Phase 3 emits WriteVXRMImm and assumes the incoming VXRM value based on
+// the information from Phase 2.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVBaseInfo.h"
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-insert-write-vxrm"
+#define RISCV_INSERT_WRITE_VXRM_NAME "RISC-V Insert Write VXRM Pass"
+
+namespace {
+
+class VXRMInfo {
+  uint8_t VXRMImm = 0;
+
+  enum : uint8_t {
+    Uninitialized,
+    Static,
+    Unknown,
+  } State = Uninitialized;
+
+public:
+  VXRMInfo() {}
+
+  static VXRMInfo getUnknown() {
+    VXRMInfo Info;
+    Info.setUnknown();
+    return Info;
+  }
+
+  bool isValid() const { return State != Uninitialized; }
+  void setUnknown() { State = Unknown; }
+  bool isUnknown() const { return State == Unknown; }
+
+  bool isStatic() const { return State == Static; }
+
+  void setVXRMImm(unsigned Imm) {
+    assert(Imm <= 3 && "Unexpected VXRM value");
+    VXRMImm = Imm;
+    State = Static;
+  }
+  unsigned getVXRMImm() const {
+    assert(isStatic() && VXRMImm <= 3 && "Unexpected state");
+    return VXRMImm;
+  }
+
+  bool operator==(const VXRMInfo &Other) const {
+    // Uninitialized is only equal to another Uninitialized.
+    if (!isValid())
+      return !Other.isValid();
+    if (!Other.isValid())
+      return !isValid();
+
+    // Unknown is only equal to another Unknown.
+    if (isUnknown())
+      return Other.isUnknown();
+    if (Other.isUnknown())
+      return isUnknown();
+
+    return VXRMImm == Other.VXRMImm;
+  }
+
+  bool operator!=(const VXRMInfo &Other) const { return !(*this == Other); }
+
+  // Calculate the VXRMInfo visible to a block assuming this and Other are
+  // both predecessors.
+  VXRMInfo intersect(const VXRMInfo &Other) const {
+    // If the new value isn't valid, ignore it.
+    if (!Other.isValid())
+      return *this;
+
+    // If this value isn't valid, this must be the first predecessor, use it.
+    if (!isValid())
+      return Other;
+
+    // If either is unknown, the result is unknown.
+    if (isUnknown() || Other.isUnknown())
+      return VXRMInfo::getUnknown();
+
+    // If we have an exact match, return this.
+    if (*this == Other)
+      return *this;
+
+    // Otherwise the result is unknown.
+    return VXRMInfo::getUnknown();
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Support for debugging, callable in GDB: V->dump()
+  LLVM_DUMP_METHOD void dump() const {
+    print(dbgs());
+    dbgs() << "\n";
+  }
+
+  void print(raw_ostream &OS) const {
+    OS << '{';
+    if (!isValid())
+      OS << "Uninitialized";
+    else if (isUnknown())
+      OS << "Unknown";
+    else
+      OS << getVXRMImm();
+    OS << '}';
+  }
+#endif
+};
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_ATTRIBUTE_USED
+inline raw_ostream &operator<<(raw_ostream &OS, const VXRMInfo &V) {
+  V.print(OS);
+  return OS;
+}
+#endif
+
+struct BlockData {
+  // The VXRMInfo that represents the net change to the VXRM register
+  // made by this block. Calculated in Phase 1.
+  VXRMInfo Change;
+
+  // The VXRMInfo that represents the VXRM settings on exit from this
+  // block. Calculated in Phase 2.
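+  // Seeded in Phase 1 with the net effect of the block alone, then refined
+  // in Phase 2 once predecessor states are known.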
+  VXRMInfo Exit;
+
+  // The VXRMInfo that represents the intersection of the VXRM settings from
+  // all predecessor blocks. Calculated in Phase 2, and used by Phase 3.
+  VXRMInfo Pred;
+
+  // Keeps track of whether the block is already in the queue.
+  bool InQueue;
+
+  BlockData() = default;
+};
+
+class RISCVInsertWriteVXRM : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+
+  std::vector<BlockData> BlockInfo;
+  std::queue<const MachineBasicBlock *> WorkList;
+
+public:
+  static char ID;
+
+  RISCVInsertWriteVXRM() : MachineFunctionPass(ID) {
+    initializeRISCVInsertWriteVXRMPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return RISCV_INSERT_WRITE_VXRM_NAME;
+  }
+
+private:
+  bool computeVXRMChanges(const MachineBasicBlock &MBB,
+                          VXRMInfo &CurInfo) const;
+  void computeIncomingVXRM(const MachineBasicBlock &MBB);
+  void emitWriteVXRM(MachineBasicBlock &MBB);
+};
+
+} // end anonymous namespace
+
+char RISCVInsertWriteVXRM::ID = 0;
+
+INITIALIZE_PASS(RISCVInsertWriteVXRM, DEBUG_TYPE, RISCV_INSERT_WRITE_VXRM_NAME,
+                false, false)
+
+bool RISCVInsertWriteVXRM::computeVXRMChanges(const MachineBasicBlock &MBB,
+                                              VXRMInfo &CurInfo) const {
+  bool NeedVXRMChange = false;
+  CurInfo = BlockInfo[MBB.getNumber()].Pred;
+
+  for (const MachineInstr &MI : MBB) {
+    int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc());
+    if (VXRMIdx >= 0) {
+      unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm();
+      NeedVXRMChange = true;
+      CurInfo.setVXRMImm(NewVXRMImm);
+    }
+
+    if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VXRM))
+      CurInfo.setUnknown();
+  }
+
+  return NeedVXRMChange;
+}
+
+void RISCVInsertWriteVXRM::computeIncomingVXRM(const MachineBasicBlock &MBB) {
+  BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+  BBInfo.InQueue = false;
+  VXRMInfo InInfo;
+
+  if (MBB.pred_empty()) {
+    // There are no predecessors, so use the default starting status.
+    InInfo.setUnknown();
+  } else {
+    for (const MachineBasicBlock *P : MBB.predecessors())
+      InInfo = InInfo.intersect(BlockInfo[P->getNumber()].Exit);
+  }
+
+  // If we don't have any valid predecessor value, wait until we do.
+  if (!InInfo.isValid())
+    return;
+
+  // If there is no change, there is no need to rerun the block.
+  if (InInfo == BBInfo.Pred)
+    return;
+
+  BBInfo.Pred = InInfo;
+  LLVM_DEBUG(dbgs() << "Entry state of " << printMBBReference(MBB)
+                    << " changed to " << BBInfo.Pred << "\n");
+
+  VXRMInfo TmpStatus;
+  computeVXRMChanges(MBB, TmpStatus);
+
+  if (BBInfo.Exit == TmpStatus)
+    return;
+
+  BBInfo.Exit = TmpStatus;
+  LLVM_DEBUG(dbgs() << "Exit state of " << printMBBReference(MBB)
+                    << " changed to " << BBInfo.Exit << "\n");
+
+  // Add the successors to the work list so we can propagate the changed exit
+  // status.
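+  // Only enqueue blocks that are not already pending, so each block appears
+  // in the queue at most once.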
+  for (const MachineBasicBlock *S : MBB.successors())
+    if (!BlockInfo[S->getNumber()].InQueue) {
+      BlockInfo[S->getNumber()].InQueue = true;
+      WorkList.push(S);
+    }
+}
+
+void RISCVInsertWriteVXRM::emitWriteVXRM(MachineBasicBlock &MBB) {
+  VXRMInfo CurInfo = BlockInfo[MBB.getNumber()].Pred;
+
+  for (MachineInstr &MI : MBB) {
+    int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc());
+    if (VXRMIdx >= 0) {
+      unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm();
+
+      if (!CurInfo.isStatic() || CurInfo.getVXRMImm() != NewVXRMImm) {
+        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(RISCV::WriteVXRMImm))
+            .addImm(NewVXRMImm);
+        MI.addOperand(MachineOperand::CreateReg(RISCV::VXRM, /*IsDef*/ false,
+                                                /*IsImp*/ true));
+      }
+
+      CurInfo.setVXRMImm(NewVXRMImm);
+    }
+
+    if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VXRM))
+      CurInfo.setUnknown();
+  }
+
+  if (CurInfo.isValid() && CurInfo != BlockInfo[MBB.getNumber()].Exit)
+    report_fatal_error("Mismatched VXRM state");
+}
+
+bool RISCVInsertWriteVXRM::runOnMachineFunction(MachineFunction &MF) {
+  // Skip if the vector extension is not enabled.
+  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+  if (!ST.hasVInstructions())
+    return false;
+
+  TII = ST.getInstrInfo();
+
+  assert(BlockInfo.empty() && "Expect empty block infos");
+  BlockInfo.resize(MF.getNumBlockIDs());
+
+  bool NeedVXRMChange = false;
+  // Phase 1 - collect VXRM info.
+  for (const MachineBasicBlock &MBB : MF) {
+    VXRMInfo TmpStatus;
+    NeedVXRMChange |= computeVXRMChanges(MBB, TmpStatus);
+    // The initial exit state is the last change made in the block.
+    BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+    BBInfo.Exit = TmpStatus;
+    LLVM_DEBUG(dbgs() << "Initial exit state of " << printMBBReference(MBB)
+                      << " is " << BBInfo.Exit << "\n");
+  }
+
+  if (NeedVXRMChange) {
+    // Phase 2 - propagate the exit state to successors. We add all blocks to
+    // the list here, but will also add any that need to be revisited during
+    // Phase 2 processing.
+    for (const MachineBasicBlock &MBB : MF) {
+      WorkList.push(&MBB);
+      BlockInfo[MBB.getNumber()].InQueue = true;
+    }
+    while (!WorkList.empty()) {
+      const MachineBasicBlock &MBB = *WorkList.front();
+      WorkList.pop();
+      computeIncomingVXRM(MBB);
+    }
+
+    // Phase 3 - add any VXRM writes needed.
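+    // At this point every block's Pred and Exit states are a fixed point of
+    // the data flow, so emission can rely on them to skip redundant writes.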
+    for (MachineBasicBlock &MBB : MF)
+      emitWriteVXRM(MBB);
+  }
+
+  BlockInfo.clear();
+
+  return NeedVXRMChange;
+}
+
+FunctionPass *llvm::createRISCVInsertWriteVXRMPass() {
+  return new RISCVInsertWriteVXRM();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -86,6 +86,7 @@
   initializeRISCVExpandPseudoPass(*PR);
   initializeRISCVInsertVSETVLIPass(*PR);
   initializeRISCVInsertReadWriteCSRPass(*PR);
+  initializeRISCVInsertWriteVXRMPass(*PR);
   initializeRISCVDAGToDAGISelPass(*PR);
   initializeRISCVInitUndefPass(*PR);
   initializeRISCVMoveMergePass(*PR);
@@ -390,6 +391,7 @@
     addPass(createRISCVMergeBaseOffsetOptPass());
   addPass(createRISCVInsertVSETVLIPass());
   addPass(createRISCVInsertReadWriteCSRPass());
+  addPass(createRISCVInsertWriteVXRMPass());
 }
 
 void RISCVPassConfig::addOptimizedRegAlloc() {
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -42,6 +42,7 @@
 ; CHECK-NEXT:       RISC-V Pre-RA pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V Insert VSETVLI pass
 ; CHECK-NEXT:       RISC-V Insert Read/Write CSR Pass
+; CHECK-NEXT:       RISC-V Insert Write VXRM Pass
 ; CHECK-NEXT:       RISC-V init undef pass
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
 ; CHECK-NEXT:       Two-Address instruction pass
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -109,6 +109,7 @@
 ; CHECK-NEXT:       RISC-V Merge Base Offset
 ; CHECK-NEXT:       RISC-V Insert VSETVLI pass
 ; CHECK-NEXT:       RISC-V Insert Read/Write CSR Pass
+; CHECK-NEXT:       RISC-V Insert Write VXRM Pass
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       RISC-V init undef pass
 ; CHECK-NEXT:       Process Implicit Definitions
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
@@ -0,0 +1,480 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  iXLen, iXLen);
+declare <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  iXLen, iXLen);
+
+; Test same rounding mode in one block.
+define <vscale x 1 x i8> @test1(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %a,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+
+  ret <vscale x 1 x i8> %b
+}
+
+; Test different rounding mode.
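+; The first vaadd uses rounding mode 2 and the second uses rounding mode 0,
+; so a csrwi is needed before each of them.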
+define <vscale x 1 x i8> @test2(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 2, iXLen %3)
+  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %a,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+
+  ret <vscale x 1 x i8> %b
+}
+
+declare <vscale x 1 x i8> @foo(<vscale x 1 x i8>)
+
+; Test same vxrm with a call in between which may invalidate vxrm.
+define <vscale x 1 x i8> @test3(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; RV32-LABEL: test3:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vaadd.vv v8, v8, v9
+; RV32-NEXT:    call foo@plt
+; RV32-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
+; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vaadd.vv v8, v8, v9
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test3:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vaadd.vv v8, v8, v9
+; RV64-NEXT:    call foo@plt
+; RV64-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
+; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vaadd.vv v8, v8, v9
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  %b = call <vscale x 1 x i8> @foo(<vscale x 1 x i8> %a)
+  %c = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %b,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+
+  ret <vscale x 1 x i8> %c
+}
+
+; Test same vxrm with inline asm in between which may invalidate vxrm.
+define <vscale x 1 x i8> @test4(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  %b = call <vscale x 1 x i8> asm "", "=^vr,0"(<vscale x 1 x i8> %a)
+  %c = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %b,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+
+  ret <vscale x 1 x i8> %c
+}
+
+; Test same rounding mode in triangle.
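+; The write in the entry block dominates condblock with the same rounding
+; mode, so condblock needs no csrwi of its own.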
+define <vscale x 1 x i8> @test5(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a1, a1, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    beqz a1, .LBB4_2
+; CHECK-NEXT:  # %bb.1: # %condblock
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:  .LBB4_2: # %mergeblock
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  br i1 %cond, label %condblock, label %mergeblock
+
+condblock:
+  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %a,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+mergeblock:
+  %c = phi <vscale x 1 x i8> [%a, %entry], [%b, %condblock]
+
+  ret <vscale x 1 x i8> %c
+}
+
+; Test same rounding mode in diamond with no dominating vxrm.
+define <vscale x 1 x i8> @test6(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a1, a1, 1
+; CHECK-NEXT:    beqz a1, .LBB5_2
+; CHECK-NEXT:  # %bb.1: # %trueblock
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB5_2: # %falseblock
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %trueblock, label %falseblock
+
+trueblock:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+falseblock:
+  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+mergeblock:
+  %c = phi <vscale x 1 x i8> [%a, %trueblock], [%b, %falseblock]
+
+  ret <vscale x 1 x i8> %c
+}
+
+; Test same rounding mode in diamond with same dominating vxrm.
+define <vscale x 1 x i8> @test7(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a1, a1, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    beqz a1, .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %trueblock
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB6_2: # %falseblock
+; CHECK-NEXT:    vasub.vv v8, v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  br i1 %cond, label %trueblock, label %falseblock
+
+trueblock:
+  %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %a,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+falseblock:
+  %c = call <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %a,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+mergeblock:
+  %d = phi <vscale x 1 x i8> [%b, %trueblock], [%c, %falseblock]
+
+  ret <vscale x 1 x i8> %d
+}
+
+; Test same rounding mode in diamond with same vxrm at merge.
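+; Both arms leave vxrm at 0, so the states intersect to a known value and
+; the vaadd at the merge point needs no extra write.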
+define <vscale x 1 x i8> @test8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a1, a1, 1
+; CHECK-NEXT:    beqz a1, .LBB7_2
+; CHECK-NEXT:  # %bb.1: # %trueblock
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB7_2: # %falseblock
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vasub.vv v8, v8, v9
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %trueblock, label %falseblock
+
+trueblock:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+falseblock:
+  %b = call <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+mergeblock:
+  %c = phi <vscale x 1 x i8> [%a, %trueblock], [%b, %falseblock]
+  %d = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %c,
+    <vscale x 1 x i8> %2,
+    iXLen 0, iXLen %3)
+
+  ret <vscale x 1 x i8> %d
+}
+
+; Test same rounding mode in diamond with different vxrm at merge.
+define <vscale x 1 x i8> @test9(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a1, a1, 1
+; CHECK-NEXT:    beqz a1, .LBB8_2
+; CHECK-NEXT:  # %bb.1: # %trueblock
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    j .LBB8_3
+; CHECK-NEXT:  .LBB8_2: # %falseblock
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vasub.vv v8, v8, v9
+; CHECK-NEXT:  .LBB8_3: # %mergeblock
+; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vaadd.vv v8, v8, v10
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %trueblock, label %falseblock
+
+trueblock:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+falseblock:
+  %b = call <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    iXLen 0, iXLen %3)
+  br label %mergeblock
+
+mergeblock:
+  %c = phi <vscale x 1 x i8> [%a, %trueblock], [%b, %falseblock]
+  %d = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %c,
+    <vscale x 1 x i8> %2,
+    iXLen 2, iXLen %3)
+
+  ret <vscale x 1 x i8> %d
+}
+
+; Test loop with no dominating vxrm write.
+; FIXME: We should hoist the vxrm write out of the loop.
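+; For now the write stays in the loop body: the entry block's exit state is
+; unknown, and intersecting it with the back-edge state stays unknown.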
+define void @test10(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, ptr nocapture readonly %ptr_op2, iXLen %n) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    beqz a3, .LBB9_2
+; CHECK-NEXT:  .LBB9_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetvli a4, a3, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a4, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v9, (a2)
+; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    bnez a3, .LBB9_1
+; CHECK-NEXT:  .LBB9_2: # %for.end
+; CHECK-NEXT:    ret
+entry:
+  %tobool.not9 = icmp eq iXLen %n, 0
+  br i1 %tobool.not9, label %for.end, label %for.body
+
+for.body:
+  %n.addr.011 = phi iXLen [ %n, %entry ], [ %sub, %for.body ]
+  %vl = tail call iXLen @llvm.riscv.vsetvli.iXLen(iXLen %n.addr.011, iXLen 0, iXLen 2)
+  %load1 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op1, iXLen %vl)
+  %load2 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op2, iXLen %vl)
+  %vadd = tail call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %load1, <vscale x 1 x i8> %load2, iXLen 2, iXLen %vl)
+  tail call void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8> %vadd, ptr %ptr_dest, iXLen %vl)
+  %sub = sub iXLen %n.addr.011, %vl
+  %tobool.not = icmp eq iXLen %sub, 0
+  br i1 %tobool.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare iXLen @llvm.riscv.vsetvli.iXLen(iXLen, iXLen immarg, iXLen immarg)
+declare <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8>, ptr nocapture, iXLen)
+declare void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8>, ptr nocapture, iXLen)
+
+; Test loop with dominating vxrm write. Make sure there is no write in the loop.
+define void @test11(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, ptr nocapture readonly %ptr_op2, iXLen %n) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a4, a3, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a4, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v9, (a2)
+; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:  .LBB10_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    beqz a3, .LBB10_3
+; CHECK-NEXT:  # %bb.2: # %for.body
+; CHECK-NEXT:    # in Loop: Header=BB10_1 Depth=1
+; CHECK-NEXT:    vsetvli a4, a3, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a4, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v9, (a2)
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    j .LBB10_1
+; CHECK-NEXT:  .LBB10_3: # %for.end
+; CHECK-NEXT:    ret
+entry:
+  %vl = tail call iXLen @llvm.riscv.vsetvli.iXLen(iXLen %n, iXLen 0, iXLen 2)
+  %load1a = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op1, iXLen %vl)
+  %load2a = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op2, iXLen %vl)
+  %vadda = tail call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %load1a, <vscale x 1 x i8> %load2a, iXLen 2, iXLen %vl)
+  tail call void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8> %vadda, ptr %ptr_dest, iXLen %vl)
+  %suba = sub iXLen %n, %vl
+  %tobool.not9 = icmp eq iXLen %suba, 0
+  br i1 %tobool.not9, label %for.end, label %for.body
+
+for.body:
+  %n.addr.011 = phi iXLen [ %suba, %entry ], [ %sub, %for.body ]
+  %vl2 = tail call iXLen @llvm.riscv.vsetvli.iXLen(iXLen %n.addr.011, iXLen 0, iXLen 2)
+  %load1 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op1, iXLen %vl2)
+  %load2 = tail call <vscale x 1 x i8> @llvm.riscv.vle.nxv1i8.iXLen(<vscale x 1 x i8> undef, ptr %ptr_op2, iXLen %vl2)
+  %vadd = tail call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %load1, <vscale x 1 x i8> %load2, iXLen 2, iXLen %vl2)
+  tail call void @llvm.riscv.vse.nxv1i8.iXLen(<vscale x 1 x i8> %vadd, ptr %ptr_dest, iXLen %vl2)
+  %sub = sub iXLen %n.addr.011, %vl2
+  %tobool.not = icmp eq iXLen %sub, 0
+  br i1 %tobool.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}