diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -241,14 +241,12 @@
   bool isCompatible(const VSETVLIInfo &InstrInfo, bool Strict) const {
     assert(isValid() && InstrInfo.isValid() &&
            "Can't compare invalid VSETVLIInfos");
-    assert(!InstrInfo.SEWLMULRatioOnly &&
-           "Expected a valid VTYPE for instruction!");
     // Nothing is compatible with Unknown.
     if (isUnknown() || InstrInfo.isUnknown())
       return false;

     // If only our VLMAX ratio is valid, then this isn't compatible.
-    if (SEWLMULRatioOnly)
+    if (SEWLMULRatioOnly || InstrInfo.SEWLMULRatioOnly)
       return false;

     // If the instruction doesn't need an AVLReg and the SEW matches, consider
@@ -450,6 +448,16 @@
   BlockData() = default;
 };

+// The different kinds of VSETVLI insertion we support.
+enum VSETVLIInsertionKind {
+  // No VSETVLI required (e.g., compatible with previous)
+  None,
+  // A new VSETVLI is required
+  Required,
+  // A previous VSETVLI may safely be mutated.
+  MutatePrevious,
+};
+
 class RISCVInsertVSETVLI : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   MachineRegisterInfo *MRI;
@@ -474,6 +482,10 @@

 private:
   bool needVSETVLI(const VSETVLIInfo &Require, const VSETVLIInfo &CurInfo);
+  VSETVLIInsertionKind needVSETVLI(const MachineInstr &MI,
+                                   const VSETVLIInfo &Require,
+                                   const VSETVLIInfo &CurInfo,
+                                   const MachineInstr *PrevVSETVLIMI = nullptr);
   bool needVSETVLIPHI(const VSETVLIInfo &Require, const MachineBasicBlock &MBB);
   void insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
                      const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo);
@@ -939,14 +951,51 @@
   return CurInfo.isCompatibleWithLoadStoreEEW(EEW, Require);
 }

+VSETVLIInsertionKind RISCVInsertVSETVLI::needVSETVLI(
+    const MachineInstr &MI, const VSETVLIInfo &Require,
+    const VSETVLIInfo &CurInfo, const MachineInstr *PrevVSETVLIMI) {
+  if (!needVSETVLI(Require, CurInfo))
+    return VSETVLIInsertionKind::None;
+
+  // If this is a unit-stride or strided load/store, we may be able to use
+  // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
+  if (canSkipVSETVLIForLoadStore(MI, Require, CurInfo))
+    return VSETVLIInsertionKind::None;
+
+  // If the previous VL/VTYPE was set by a VSETVLI whose result is unused,
+  // we may be able to mutate it instead of inserting a new VSETVLI.
+  if (PrevVSETVLIMI) {
+    bool HasSameAVL =
+        CurInfo.hasSameAVL(Require) ||
+        (Require.hasAVLReg() && Require.getAVLReg().isVirtual() &&
+         Require.getAVLReg() == PrevVSETVLIMI->getOperand(0).getReg());
+    // If these two VSETVLIs have the same AVL and the same VLMAX,
+    // we can merge them by rewriting the previous one.
+    if (HasSameAVL && CurInfo.getSEWLMULRatio() == Require.getSEWLMULRatio())
+      return VSETVLIInsertionKind::MutatePrevious;
+
+    if (isScalarMoveInstr(MI) &&
+        ((CurInfo.hasNonZeroAVL() && Require.hasNonZeroAVL()) ||
+         (CurInfo.hasZeroAVL() && Require.hasZeroAVL())) &&
+        Require.hasSameVLMAX(CurInfo))
+      return VSETVLIInsertionKind::MutatePrevious;
+  }
+
+  return VSETVLIInsertionKind::Required;
+}
+
 bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) {
   bool HadVectorOp = false;

+  // Only set if the current VSETVLIInfo is from an explicit VSET(I)VLI.
+  const MachineInstr *PrevVSETVLIMI = nullptr;
+
   BlockData &BBInfo = BlockInfo[MBB.getNumber()];
   for (const MachineInstr &MI : MBB) {
     // If this is an explicit VSETVLI or VSETIVLI, update our state.
     if (isVectorConfigInstr(MI)) {
       HadVectorOp = true;
+      PrevVSETVLIMI = &MI;
       BBInfo.Change = getInfoForVSETVLI(MI);
       continue;
     }
@@ -962,15 +1011,15 @@
       } else {
         // If this instruction isn't compatible with the previous VL/VTYPE
         // we need to insert a VSETVLI.
-        // If this is a unit-stride or strided load/store, we may be able to use
-        // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
         // NOTE: We only do this if the vtype we're comparing against was
         // created in this block. We need the first and third phase to treat
         // the store the same way.
-        if (!canSkipVSETVLIForLoadStore(MI, NewInfo, BBInfo.Change) &&
-            needVSETVLI(NewInfo, BBInfo.Change))
+        if (needVSETVLI(MI, NewInfo, BBInfo.Change, PrevVSETVLIMI) !=
+            VSETVLIInsertionKind::None) {
           BBInfo.Change = NewInfo;
+        }
       }
+      PrevVSETVLIMI = nullptr;
     }

     // If this is something that updates VL/VTYPE that we don't know about, set
@@ -978,6 +1027,7 @@
     if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) ||
         MI.modifiesRegister(RISCV::VTYPE)) {
       BBInfo.Change = VSETVLIInfo::getUnknown();
+      PrevVSETVLIMI = nullptr;
     }
   }

@@ -1013,14 +1063,22 @@
   LLVM_DEBUG(dbgs() << "Entry state of " << printMBBReference(MBB)
                     << " changed to " << BBInfo.Pred << "\n");

-  VSETVLIInfo TmpStatus = BBInfo.Pred.merge(BBInfo.Change);
+  // Cache the exit state before recomputation.
+  // FIXME: We can drop 'Change' and just use 'Exit'.
+  // FIXME: We shouldn't have to set 'Change' as an input.
+  auto CachedExit = BBInfo.Exit;
+
+  // Now that we've computed the info for predecessors, recompute the VTYPE
+  // changes on this block. The predecessors may have changed the incoming
+  // vtype and we must be in sync with phase 3.
+  BBInfo.Change = InInfo;
+  computeVLVTYPEChanges(MBB);

   // If the new exit value matches the old exit value, we don't need to revisit
   // any blocks.
-  if (BBInfo.Exit == TmpStatus)
+  if (CachedExit == BBInfo.Exit)
     return;

-  BBInfo.Exit = TmpStatus;
   LLVM_DEBUG(dbgs() << "Exit state of " << printMBBReference(MBB)
                     << " changed to " << BBInfo.Exit << "\n");

@@ -1081,8 +1139,10 @@
 }

 void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
-  VSETVLIInfo CurInfo;
-  // Only be set if current VSETVLIInfo is from an explicit VSET(I)VLI.
+  BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+  VSETVLIInfo CurInfo = BBInfo.Pred;
+
+  // Only set if the current VSETVLIInfo is from an explicit VSET(I)VLI.
   MachineInstr *PrevVSETVLIMI = nullptr;

   for (MachineInstr &MI : MBB) {
@@ -1124,46 +1184,33 @@
        // use the predecessor information.
        assert(BlockInfo[MBB.getNumber()].Pred.isValid() &&
               "Expected a valid predecessor state.");
-       if (needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred) &&
-           needVSETVLIPHI(NewInfo, MBB)) {
-         insertVSETVLI(MBB, MI, NewInfo, BlockInfo[MBB.getNumber()].Pred);
+       if (needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred)) {
+         if (needVSETVLIPHI(NewInfo, MBB)) {
+           insertVSETVLI(MBB, MI, NewInfo, BlockInfo[MBB.getNumber()].Pred);
+         } else {
+           // If this is the first implicit state change, and the state change
+           // requested can be proven to produce the same register contents, we
+           // can skip emitting the actual state change and continue as if we
+           // had, since we know the GPR result of the implicit state change
+           // wouldn't be used and the VL/VTYPE registers are already correct.
+           // Note that we *do* need to model the state as if it changed: while
+           // the register contents are unchanged, the abstract state can change.
+         }
+         PrevVSETVLIMI = nullptr;
          CurInfo = NewInfo;
        }
      } else {
        // If this instruction isn't compatible with the previous VL/VTYPE
        // we need to insert a VSETVLI.
-       // If this is a unit-stride or strided load/store, we may be able to use
-       // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
        // NOTE: We can't use predecessor information for the store. We must
        // treat it the same as the first phase so that we produce the correct
-       // vl/vtype for succesor blocks.
-       if (!canSkipVSETVLIForLoadStore(MI, NewInfo, CurInfo) &&
-           needVSETVLI(NewInfo, CurInfo)) {
-         // If the previous VL/VTYPE is set by VSETVLI and do not use, Merge it
-         // with current VL/VTYPE.
-         bool NeedInsertVSETVLI = true;
-         if (PrevVSETVLIMI) {
-           bool HasSameAVL =
-               CurInfo.hasSameAVL(NewInfo) ||
-               (NewInfo.hasAVLReg() && NewInfo.getAVLReg().isVirtual() &&
-                NewInfo.getAVLReg() == PrevVSETVLIMI->getOperand(0).getReg());
-           // If these two VSETVLI have the same AVL and the same VLMAX,
-           // we could merge these two VSETVLI.
-           if (HasSameAVL &&
-               CurInfo.getSEWLMULRatio() == NewInfo.getSEWLMULRatio()) {
-             PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
-             NeedInsertVSETVLI = false;
-           }
-           if (isScalarMoveInstr(MI) &&
-               ((CurInfo.hasNonZeroAVL() && NewInfo.hasNonZeroAVL()) ||
-                (CurInfo.hasZeroAVL() && NewInfo.hasZeroAVL())) &&
-               NewInfo.hasSameVLMAX(CurInfo)) {
-             PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
-             NeedInsertVSETVLI = false;
-           }
-         }
-         if (NeedInsertVSETVLI)
+       // vl/vtype for successor blocks.
+       auto Kind = needVSETVLI(MI, NewInfo, CurInfo, PrevVSETVLIMI);
+       if (Kind != VSETVLIInsertionKind::None) {
+         if (Kind == VSETVLIInsertionKind::Required)
           insertVSETVLI(MBB, MI, NewInfo, CurInfo);
+         else if (Kind == VSETVLIInsertionKind::MutatePrevious)
+           PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
         CurInfo = NewInfo;
       }
     }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -90,11 +90,12 @@
 ; CHECK-NEXT:  # %bb.1: # %if.then
 ; CHECK-NEXT:    vsetvli a0, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vfadd.vv v9, v8, v9
-; CHECK-NEXT:    vfmul.vv v8, v9, v8
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    j .LBB2_3
 ; CHECK-NEXT:  .LBB2_2: # %if.else
 ; CHECK-NEXT:    vsetvli a0, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsub.vv v9, v8, v9
+; CHECK-NEXT:  .LBB2_3: # %if.end
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmul.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 entry:
@@ -449,11 +450,12 @@
 ; CHECK-NEXT:    beqz a3, .LBB8_2
 ; CHECK-NEXT:  .LBB8_1: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
 ; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    slli a4, a3, 2
 ; CHECK-NEXT:    add a1, a1, a4
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, tu, mu
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, tu, mu
 ; CHECK-NEXT:    vfmacc.vf v16, fa0, v8
 ; CHECK-NEXT:    vse32.v v16, (a2)
 ; CHECK-NEXT:    sub a0, a0, a3
@@ -505,8 +507,8 @@
 ; CHECK-NEXT:    andi a0, a3, 1
 ; CHECK-NEXT:    beqz a0, .LBB9_2
 ; CHECK-NEXT:  # %bb.1: # %if
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vwcvt.x.x.v v8, v10
 ; CHECK-NEXT:  .LBB9_2: # %if.end
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
@@ -544,8 +546,8 @@
 ; CHECK-NEXT:    andi a0, a4, 1
 ; CHECK-NEXT:    beqz a0, .LBB10_2
 ; CHECK-NEXT:  # %bb.1: # %if
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vwadd.wv v9, v9, v10
 ; CHECK-NEXT:  .LBB10_2: # %if.end
 ; CHECK-NEXT:    andi a0, a5, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -600,10 +600,8 @@
   ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 [[PseudoVID_V_M1_]], [[PHI]], -1, 6 /* e64 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:gpr = MUL [[PHI]], [[SRLI]]
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[MUL]]
-  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 87 /* e32, mf2, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   PseudoVSE32_V_MF2 killed [[PseudoVADD_VX_M1_]], killed [[ADD]], -1, 5 /* e32 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI [[PHI]], 1
-  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   BLTU [[ADDI]], [[COPY1]], %bb.1
   ; CHECK-NEXT:   PseudoBR %bb.2
   ; CHECK-NEXT: {{  $}}
@@ -744,9 +742,6 @@
 ...
 ---
-# FIXME: This test shows incorrect VSETVLI insertion. The VLUXEI64 needs
-# configuration for SEW=8 but it instead inherits a SEW=64 from the entry
-# block.
 name:            vsetvli_vluxei64_regression
 tracksRegLiveness: true
 body:             |
@@ -779,6 +774,7 @@
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $v0 = COPY %mask
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   early-clobber %t0:vrnov0 = PseudoVLUXEI64_V_M1_MF8_MASK %t5, killed %inaddr, %idxs, $v0, -1, 3 /* e8 */, 1, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   %ldval:vr = COPY %t0
   ; CHECK-NEXT:   PseudoBR %bb.3
@@ -786,6 +782,7 @@
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   %stval:vr = PHI %t4, %bb.1, %ldval, %bb.2
   ; CHECK-NEXT:   $v0 = COPY %mask
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   PseudoVSOXEI64_V_M1_MF8_MASK killed %stval, killed %b, %idxs, $v0, -1, 3 /* e8 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoRET
   bb.0:
@@ -865,8 +862,8 @@
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:gpr = ADD %src, [[PHI]]
-  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   [[PseudoVLE8_V_MF8_:%[0-9]+]]:vrnov0 = PseudoVLE8_V_MF8 killed [[ADD1]], -1, 3 /* e8 */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   [[PseudoVADD_VI_MF8_:%[0-9]+]]:vrnov0 = PseudoVADD_VI_MF8 [[PseudoVLE8_V_MF8_]], 4, -1, 3 /* e8 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[ADD2:%[0-9]+]]:gpr = ADD %dst, [[PHI]]
   ; CHECK-NEXT:   PseudoVSE8_V_MF8 killed [[PseudoVADD_VI_MF8_]], killed [[ADD2]], -1, 3 /* e8 */, implicit $vl, implicit $vtype
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -102,23 +102,24 @@
 define void @test6(i32* nocapture readonly %A, i32* nocapture %B, i64 %n) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a6, a2, e32, m1, ta, mu
-; CHECK-NEXT:    beqz a6, .LBB5_3
+; CHECK-NEXT:    vsetvli a3, a2, e32, m1, ta, mu
+; CHECK-NEXT:    beqz a3, .LBB5_3
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    li a4, 0
 ; CHECK-NEXT:  .LBB5_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    slli a5, a4, 2
-; CHECK-NEXT:    add a3, a0, a5
-; CHECK-NEXT:    vle32.v v8, (a3)
+; CHECK-NEXT:    add a6, a0, a5
+; CHECK-NEXT:    vsetvli zero, a3, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a6)
 ; CHECK-NEXT:    vmsle.vi v9, v8, -3
 ; CHECK-NEXT:    vmsgt.vi v10, v8, 2
 ; CHECK-NEXT:    vmor.mm v0, v9, v10
-; CHECK-NEXT:    add a3, a1, a5
-; CHECK-NEXT:    vse32.v v8, (a3), v0.t
-; CHECK-NEXT:    add a4, a4, a6
-; CHECK-NEXT:    vsetvli a6, a2, e32, m1, ta, mu
-; CHECK-NEXT:    bnez a6, .LBB5_2
+; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    vse32.v v8, (a5), v0.t
+; CHECK-NEXT:    add a4, a4, a3
+; CHECK-NEXT:    vsetvli a3, a2, e32, m1, ta, mu
+; CHECK-NEXT:    bnez a3, .LBB5_2
 ; CHECK-NEXT:  .LBB5_3: # %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry: