diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -108,6 +108,34 @@
   }
 }
 
+static bool isCompressInstr(const MachineInstr &MI) {
+  return getRVVMCOpcode(MI.getOpcode()) == RISCV::VCOMPRESS_VM;
+}
+
+static bool isReductionInstr(const MachineInstr &MI) {
+  switch (getRVVMCOpcode(MI.getOpcode())) {
+  default:
+    return false;
+  case RISCV::VREDSUM_VS:
+  case RISCV::VREDMAXU_VS:
+  case RISCV::VREDMAX_VS:
+  case RISCV::VREDMINU_VS:
+  case RISCV::VREDMIN_VS:
+  case RISCV::VREDAND_VS:
+  case RISCV::VREDOR_VS:
+  case RISCV::VREDXOR_VS:
+  case RISCV::VWREDSUMU_VS:
+  case RISCV::VWREDSUM_VS:
+  case RISCV::VFREDOSUM_VS:
+  case RISCV::VFREDUSUM_VS:
+  case RISCV::VFREDMIN_VS:
+  case RISCV::VFREDMAX_VS:
+  case RISCV::VFWREDOSUM_VS:
+  case RISCV::VFWREDUSUM_VS:
+    return true;
+  }
+}
+
 /// Get the EEW for a load or store instruction. Return std::nullopt if MI is
 /// not a load or store which ignores SEW.
 static std::optional<unsigned> getEEWForLoadStore(const MachineInstr &MI) {
@@ -934,6 +962,26 @@
     Used.TailPolicy = false;
   }
 
+  // For most instructions, tail element is defined as:
+  //   tail(x) = (vl <= x < max(VLMAX,VLEN/SEW))
+  // So if the avl is VLMAX, and LMUL is not fractional, there is no tail
+  // element, so it doesn't need tail policy.
+  if (!isScalarMoveInstr(MI) && !isReductionInstr(MI) && !isCompressInstr(MI))
+    if (RISCVII::hasVLOp(MI.getDesc().TSFlags)) {
+      RISCVII::VLMUL VLMul = RISCVII::getLMul(MI.getDesc().TSFlags);
+      // Fractional LMULs always require tail policy.
+      if (VLMul < RISCVII::LMUL_RESERVED) {
+        const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
+        if (VLOp.isImm()) {
+          int64_t Imm = VLOp.getImm();
+          if (Imm == RISCV::VLMaxSentinel)
+            Used.TailPolicy = false;
+        } else if (VLOp.getReg() == RISCV::X0) {
+          Used.TailPolicy = false;
+        }
+      }
+    }
+
   // A tail undefined vmv.v.i/x or vfmv.v.f with VL=1 can be treated in the same
   // semantically as vmv.s.x. This is particularly useful since we don't have an
   // immediate form of vmv.s.x, and thus frequently use vmv.v.i in it's place.
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -385,7 +385,6 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfadd.vv v9, v9, v10
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x float> @llvm.experimental.constrained.fadd(<vscale x 2 x float> %x, <vscale x 2 x float> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -579,6 +579,32 @@
   ret %2
 }
 
+define <vscale x 1 x i64> @test21(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, <vscale x 1 x i1> %mask) nounwind {
+; CHECK-LABEL: test21:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, tu, mu
+; CHECK-NEXT:    vadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    vadd.vv v9, v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %x = call <vscale x 1 x i64> @llvm.riscv.vadd.mask.nxv1i64.nxv1i64(
+    <vscale x 1 x i64> %a,
+    <vscale x 1 x i64> %a,
+    <vscale x 1 x i64> %a,
+    <vscale x 1 x i1> %mask,
+    i64 -1,
+    i64 0)
+  %y = call <vscale x 1 x i64> @llvm.riscv.vadd.mask.nxv1i64.nxv1i64(
+    <vscale x 1 x i64> %b,
+    <vscale x 1 x i64> %b,
+    <vscale x 1 x i64> %x,
+    <vscale x 1 x i1> %mask,
+    i64 -1,
+    i64 1)
+  ret <vscale x 1 x i64> %y
+}
+
 ; This used to fail the machine verifier due to the vsetvli being removed
 ; while the add was still using it.
 define i64 @bad_removal(<2 x i64> %arg) {