diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -108,6 +108,34 @@
   }
 }
 
+static bool isCompressInstr(const MachineInstr &MI) {
+  return getRVVMCOpcode(MI.getOpcode()) == RISCV::VCOMPRESS_VM;
+}
+
+static bool isReductionInstr(const MachineInstr &MI) {
+  switch (getRVVMCOpcode(MI.getOpcode())) {
+  default:
+    return false;
+  case RISCV::VREDSUM_VS:
+  case RISCV::VREDMAXU_VS:
+  case RISCV::VREDMAX_VS:
+  case RISCV::VREDMINU_VS:
+  case RISCV::VREDMIN_VS:
+  case RISCV::VREDAND_VS:
+  case RISCV::VREDOR_VS:
+  case RISCV::VREDXOR_VS:
+  case RISCV::VWREDSUMU_VS:
+  case RISCV::VWREDSUM_VS:
+  case RISCV::VFREDOSUM_VS:
+  case RISCV::VFREDUSUM_VS:
+  case RISCV::VFREDMIN_VS:
+  case RISCV::VFREDMAX_VS:
+  case RISCV::VFWREDOSUM_VS:
+  case RISCV::VFWREDUSUM_VS:
+    return true;
+  }
+}
+
 /// Get the EEW for a load or store instruction. Return std::nullopt if MI is
 /// not a load or store which ignores SEW.
 static std::optional<unsigned> getEEWForLoadStore(const MachineInstr &MI) {
@@ -934,6 +962,26 @@
     Used.TailPolicy = false;
   }
 
+  // For most instructions, tail element is defined as:
+  //   tail(x) = (vl <= x < max(VLMAX,VLEN/SEW))
+  // So if the avl is VLMAX, and LMUL is not fractional, there is no tail
+  // element, so it doesn't need tail policy.
+  if (!isScalarMoveInstr(MI) && !isReductionInstr(MI) && !isCompressInstr(MI))
+    if (RISCVII::hasVLOp(MI.getDesc().TSFlags)) {
+      RISCVII::VLMUL VLMul = RISCVII::getLMul(MI.getDesc().TSFlags);
+      // Fractional LMULs always require tail policy.
+      if (VLMul < RISCVII::LMUL_RESERVED) {
+        const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
+        if (VLOp.isImm()) {
+          int64_t Imm = VLOp.getImm();
+          if (Imm == RISCV::VLMaxSentinel)
+            Used.TailPolicy = false;
+        } else if (VLOp.getReg() == RISCV::X0) {
+          Used.TailPolicy = false;
+        }
+      }
+    }
+
   // A tail undefined vmv.v.i/x or vfmv.v.f with VL=1 can be treated in the same
   // semantically as vmv.s.x. This is particularly useful since we don't have an
   // immediate form of vmv.s.x, and thus frequently use vmv.v.i in it's place.
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -385,7 +385,6 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfadd.vv v9, v9, v10
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x float> @llvm.experimental.constrained.fadd(<vscale x 2 x float> %x, <vscale x 2 x float> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -579,6 +579,32 @@
   ret %2
 }
 
+define <vscale x 1 x i64> @test21(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, <vscale x 1 x i1> %mask) nounwind {
+; CHECK-LABEL: test21:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, tu, mu
+; CHECK-NEXT:    vadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    vadd.vv v9, v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %x = call <vscale x 1 x i64> @llvm.riscv.vadd.mask.nxv1i64.nxv1i64(
+    <vscale x 1 x i64> %a,
+    <vscale x 1 x i64> %a,
+    <vscale x 1 x i64> %a,
+    <vscale x 1 x i1> %mask,
+    i64 -1,
+    i64 0)
+  %y = call <vscale x 1 x i64> @llvm.riscv.vadd.mask.nxv1i64.nxv1i64(
+    <vscale x 1 x i64> %b,
+    <vscale x 1 x i64> %b,
+    <vscale x 1 x i64> %x,
+    <vscale x 1 x i1> %mask,
+    i64 -1,
+    i64 1)
+  ret <vscale x 1 x i64> %y
+}
+
 ; This used to fail the machine verifier due to the vsetvli being removed
 ; while the add was still using it.
 define i64 @bad_removal(<2 x i64> %arg) {