Index: llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -373,24 +373,6 @@
     return VSETVLIInfo::getUnknown();
   }

-  // Calculate the VSETVLIInfo visible at the end of the block assuming this
-  // is the predecessor value, and Other is change for this block.
-  VSETVLIInfo merge(const VSETVLIInfo &Other) const {
-    assert(isValid() && "Can only merge with a valid VSETVLInfo");
-
-    // Nothing changed from the predecessor, keep it.
-    if (!Other.isValid())
-      return *this;
-
-    // If the change is compatible with the input, we won't create a VSETVLI
-    // and should keep the predecessor.
-    if (isCompatible(Other, /*Strict*/ true))
-      return *this;
-
-    // Otherwise just use whatever is in this block.
-    return Other;
-  }
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Support for debugging, callable in GDB: V->dump()
   LLVM_DUMP_METHOD void dump() const {
@@ -943,6 +925,7 @@
   bool HadVectorOp = false;

   BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+  BBInfo.Change = BBInfo.Pred;
   for (const MachineInstr &MI : MBB) {
     // If this is an explicit VSETVLI or VSETIVLI, update our state.
     if (isVectorConfigInstr(MI)) {
@@ -957,19 +940,18 @@

       VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, MRI);

-      if (!BBInfo.Change.isValid()) {
+      // If this instruction isn't compatible with the previous VL/VTYPE
+      // we need to insert a VSETVLI.
+      // If this is a unit-stride or strided load/store, we may be able to use
+      // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
+      // NOTE: We only do this if the vtype we're comparing against was
+      // created in this block. We need the first and third phase to treat
+      // the store the same way.
+      if (!canSkipVSETVLIForLoadStore(MI, NewInfo, BBInfo.Change) &&
+          needVSETVLI(NewInfo, BBInfo.Change)) {
+        LLVM_DEBUG(dbgs() << "Updated to " << NewInfo << " due to " << MI
+                          << " (phase1/2)\n");
         BBInfo.Change = NewInfo;
-      } else {
-        // If this instruction isn't compatible with the previous VL/VTYPE
-        // we need to insert a VSETVLI.
-        // If this is a unit-stride or strided load/store, we may be able to use
-        // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
-        // NOTE: We only do this if the vtype we're comparing against was
-        // created in this block. We need the first and third phase to treat
-        // the store the same way.
-        if (!canSkipVSETVLIForLoadStore(MI, NewInfo, BBInfo.Change) &&
-            needVSETVLI(NewInfo, BBInfo.Change))
-          BBInfo.Change = NewInfo;
       }
     }

@@ -981,13 +963,11 @@
     }
   }

-  // Initial exit state is whatever change we found in the block.
-  BBInfo.Exit = BBInfo.Change;
-
   return HadVectorOp;
 }

 void RISCVInsertVSETVLI::computeIncomingVLVTYPE(const MachineBasicBlock &MBB) {
+
   BlockData &BBInfo = BlockInfo[MBB.getNumber()];

   BBInfo.InQueue = false;
@@ -1000,10 +980,7 @@
     for (MachineBasicBlock *P : MBB.predecessors())
       InInfo = InInfo.intersect(BlockInfo[P->getNumber()].Exit);
   }
-
-  // If we don't have any valid predecessor value, wait until we do.
-  if (!InInfo.isValid())
-    return;
+  assert(InInfo.isValid());

   // If no change, no need to rerun block
   if (InInfo == BBInfo.Pred)
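
The hunk above is sound because each block's entry state is the meet (intersect) of its predecessors' exit states, and phase 1 now seeds every block's Pred with Unknown, so a valid value always exists by the time phase 2 runs; hence the assert replacing the old early return. Below is a minimal standalone sketch of that meet pattern. The State type and its fields are illustrative stand-ins for the pass's VSETVLIInfo, not its real representation.

#include <cstdint>
#include <vector>

// Illustrative stand-in for the pass's VSETVLIInfo lattice: Uninitialized
// (top), an exact VL/VTYPE configuration, or Unknown (bottom).
struct State {
  enum Kind { Uninitialized, Exact, Unknown } K = Uninitialized;
  uint64_t VTypeBits = 0; // stand-in for the real SEW/LMUL/policy fields

  bool isValid() const { return K != Uninitialized; }

  // Meet operator: agreeing exact states stay exact; any disagreement or an
  // Unknown input collapses to Unknown, mirroring VSETVLIInfo::intersect.
  State intersect(const State &Other) const {
    if (!Other.isValid())
      return *this;
    if (!isValid())
      return Other;
    if (K == Exact && Other.K == Exact && VTypeBits == Other.VTypeBits)
      return *this;
    return State{Unknown, 0};
  }
};

// Entry state = meet over all predecessor exit states, as in
// computeIncomingVLVTYPE above. With every Pred seeded to Unknown in
// phase 1, the result is always valid, which is what the patch asserts.
State computeEntry(const std::vector<State> &PredExits) {
  if (PredExits.empty())
    return State{State::Unknown, 0}; // e.g. the function's entry block
  State In; // starts at top (Uninitialized)
  for (const State &E : PredExits)
    In = In.intersect(E);
  return In;
}
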
@@ -1013,7 +990,12 @@
   LLVM_DEBUG(dbgs() << "Entry state of " << printMBBReference(MBB)
                     << " changed to " << BBInfo.Pred << "\n");

-  VSETVLIInfo TmpStatus = BBInfo.Pred.merge(BBInfo.Change);
+  // Note: It's tempting to cache the state changes here, but due to the
+  // compatibility checks performed, a block's output state can change based
+  // on the input state. To cache, we'd have to add logic for finding
+  // never-compatible state changes.
+  computeVLVTYPEChanges(MBB);
+  VSETVLIInfo TmpStatus = BBInfo.Change;

 // If the new exit value matches the old exit value, we don't need to revisit
 // any blocks.
@@ -1081,10 +1063,12 @@
 }

 void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
-  VSETVLIInfo CurInfo;
+  assert(BlockInfo[MBB.getNumber()].Pred.isValid() &&
+         "Expected a valid predecessor state.");
+  VSETVLIInfo CurInfo = BlockInfo[MBB.getNumber()].Pred;
   // Only set if the current VSETVLIInfo is from an explicit VSET(I)VLI.
   MachineInstr *PrevVSETVLIMI = nullptr;
-
+  bool IsFirstStateChange = true;
   for (MachineInstr &MI : MBB) {
     // If this is an explicit VSETVLI or VSETIVLI, update our state.
     if (isVectorConfigInstr(MI)) {
@@ -1096,6 +1080,7 @@
         MI.getOperand(4).setIsDead(false);
       CurInfo = getInfoForVSETVLI(MI);
       PrevVSETVLIMI = &MI;
+      IsFirstStateChange = false;
       continue;
     }

@@ -1119,60 +1104,59 @@
       MI.addOperand(MachineOperand::CreateReg(RISCV::VTYPE, /*isDef*/ false,
                                               /*isImp*/ true));

-      if (!CurInfo.isValid()) {
-        // We haven't found any vector instructions or VL/VTYPE changes yet,
-        // use the predecessor information.
-        assert(BlockInfo[MBB.getNumber()].Pred.isValid() &&
-               "Expected a valid predecessor state.");
-        if (needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred)) {
-          // If this is the first implicit state change, and the state change
-          // requested can be proven to produce the same register contents, we
-          // can skip emitting the actual state change and continue as if we
-          // had since we know the GPR result of the implicit state change
-          // wouldn't be used and VL/VTYPE registers are correct. Note that
-          // we *do* need to model the state as if it changed as while the
-          // register contents are unchanged, the abstract model can change.
-          if (needVSETVLIPHI(NewInfo, MBB))
-            insertVSETVLI(MBB, MI, NewInfo, BlockInfo[MBB.getNumber()].Pred);
+      // If this instruction isn't compatible with the previous VL/VTYPE
+      // we need to insert a VSETVLI.
+      // If this is a unit-stride or strided load/store, we may be able to use
+      // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
+      // NOTE: We can't use predecessor information for the store. We must
+      // treat it the same as the first phase so that we produce the correct
+      // vl/vtype for successor blocks.
+      if (!canSkipVSETVLIForLoadStore(MI, NewInfo, CurInfo) &&
+          needVSETVLI(NewInfo, CurInfo)) {
+
+        // If this is the first implicit state change, and the state change
+        // requested can be proven to produce the same register contents, we
+        // can skip emitting the actual state change and continue as if we
+        // had since we know the GPR result of the implicit state change
+        // wouldn't be used and VL/VTYPE registers are correct. Note that
+        // we *do* need to model the state as if it changed as while the
+        // register contents are unchanged, the abstract model can change.
+        if (IsFirstStateChange && !needVSETVLIPHI(NewInfo, MBB)) {
           CurInfo = NewInfo;
+          PrevVSETVLIMI = nullptr;
+          IsFirstStateChange = false;
+          continue;
         }
-      } else {
-        // If this instruction isn't compatible with the previous VL/VTYPE
-        // we need to insert a VSETVLI.
-        // If this is a unit-stride or strided load/store, we may be able to use
-        // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
-        // NOTE: We can't use predecessor information for the store. We must
-        // treat it the same as the first phase so that we produce the correct
-        // vl/vtype for succesor blocks.
-        if (!canSkipVSETVLIForLoadStore(MI, NewInfo, CurInfo) &&
-            needVSETVLI(NewInfo, CurInfo)) {
-          // If the previous VL/VTYPE is set by VSETVLI and do not use, Merge it
-          // with current VL/VTYPE.
-          bool NeedInsertVSETVLI = true;
-          if (PrevVSETVLIMI) {
-            bool HasSameAVL =
-                CurInfo.hasSameAVL(NewInfo) ||
-                (NewInfo.hasAVLReg() && NewInfo.getAVLReg().isVirtual() &&
-                 NewInfo.getAVLReg() == PrevVSETVLIMI->getOperand(0).getReg());
-            // If these two VSETVLI have the same AVL and the same VLMAX,
-            // we could merge these two VSETVLI.
-            if (HasSameAVL &&
-                CurInfo.getSEWLMULRatio() == NewInfo.getSEWLMULRatio()) {
-              PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
-              NeedInsertVSETVLI = false;
-            }
-            if (isScalarMoveInstr(MI) &&
-                ((CurInfo.hasNonZeroAVL() && NewInfo.hasNonZeroAVL()) ||
-                 (CurInfo.hasZeroAVL() && NewInfo.hasZeroAVL())) &&
-                NewInfo.hasSameVLMAX(CurInfo)) {
-              PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
-              NeedInsertVSETVLI = false;
-            }
+
+        // If the previous VL/VTYPE was set by a VSETVLI whose result is
+        // unused, merge the new state into it instead of inserting another
+        // VSETVLI.
+        bool NeedInsertVSETVLI = true;
+        if (PrevVSETVLIMI) {
+          bool HasSameAVL =
+              CurInfo.hasSameAVL(NewInfo) ||
+              (NewInfo.hasAVLReg() && NewInfo.getAVLReg().isVirtual() &&
+               NewInfo.getAVLReg() == PrevVSETVLIMI->getOperand(0).getReg());
+          // If these two VSETVLI have the same AVL and the same VLMAX,
+          // we could merge these two VSETVLI.
+          if (HasSameAVL &&
+              CurInfo.getSEWLMULRatio() == NewInfo.getSEWLMULRatio()) {
+            PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
+            NeedInsertVSETVLI = false;
+          }
+          if (isScalarMoveInstr(MI) &&
+              ((CurInfo.hasNonZeroAVL() && NewInfo.hasNonZeroAVL()) ||
+               (CurInfo.hasZeroAVL() && NewInfo.hasZeroAVL())) &&
+              NewInfo.hasSameVLMAX(CurInfo)) {
+            PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
+            NeedInsertVSETVLI = false;
           }
-          if (NeedInsertVSETVLI)
-            insertVSETVLI(MBB, MI, NewInfo, CurInfo);
-          CurInfo = NewInfo;
         }
+        if (NeedInsertVSETVLI)
+          insertVSETVLI(MBB, MI, NewInfo, CurInfo);
+        LLVM_DEBUG(dbgs() << "Updated to " << NewInfo << " due to " << MI
+                          << " (phase3)\n");
+        CurInfo = NewInfo;
+        IsFirstStateChange = false;
       }
       PrevVSETVLIMI = nullptr;
     }
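
The vsetvli-merging logic retained above hinges on VLMAX: if the prior explicit vsetvli and the new requirement share an AVL and the SEW/LMUL ratio, then VLMAX (and hence the value written to vl) is unchanged, so rewriting the prior instruction's vtype immediate suffices. Below is a hedged sketch of that ratio check; the scale-by-8 representation is an assumption used here to keep fractional LMULs integral, not necessarily the pass's exact encoding.

#include <cassert>

// SEW/LMUL ratio, scaled by 8 so fractional LMULs (mf8..mf2) stay integral.
// LMul is the group multiplier (1, 2, 4, 8); Fractional selects 1/LMul.
unsigned getSEWLMULRatio(unsigned SEW, bool Fractional, unsigned LMul) {
  assert(SEW >= 8 && SEW <= 64 && LMul >= 1 && LMul <= 8);
  unsigned LMulX8 = Fractional ? 8 / LMul : LMul * 8;
  return (SEW * 8) / LMulX8;
}

int main() {
  // Same ratio => same VLMAX: e32/m1 and e16/mf2 (the pair toggled in the
  // .ll test below), so rewriting the earlier vsetvli's vtype immediate
  // leaves vl intact.
  assert(getSEWLMULRatio(32, /*Fractional=*/false, 1) ==
         getSEWLMULRatio(16, /*Fractional=*/true, 2));
  // Different ratio => VLMAX changes, and a real vsetvli is required.
  assert(getSEWLMULRatio(32, false, 1) != getSEWLMULRatio(64, false, 1));
  return 0;
}
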
@@ -1182,6 +1166,7 @@
     if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) ||
         MI.modifiesRegister(RISCV::VTYPE)) {
       CurInfo = VSETVLIInfo::getUnknown();
+      IsFirstStateChange = false;
       PrevVSETVLIMI = nullptr;
     }

@@ -1192,6 +1177,7 @@
     if (CurInfo.isValid() && ExitInfo.isValid() && !ExitInfo.isUnknown() &&
         CurInfo != ExitInfo) {
       insertVSETVLI(MBB, MI, ExitInfo, CurInfo);
+      IsFirstStateChange = false;
       CurInfo = ExitInfo;
     }
   }
@@ -1227,8 +1213,13 @@
   bool HaveVectorOp = false;

   // Phase 1 - determine how VL/VTYPE are affected by each block.
-  for (const MachineBasicBlock &MBB : MF)
+  for (const MachineBasicBlock &MBB : MF) {
+    BlockData &BBInfo = BlockInfo[MBB.getNumber()];
+    BBInfo.Pred = VSETVLIInfo::getUnknown();
     HaveVectorOp |= computeVLVTYPEChanges(MBB);
+    // Initial exit state is whatever change we found in the block.
+    BBInfo.Exit = BBInfo.Change;
+  }

   // If we didn't find any instructions that need VSETVLI, we're done.
   if (!HaveVectorOp) {
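
Net effect on the pass's structure: phase 1 seeds every block's Pred with Unknown and computes the local transfer, phase 2 iterates entry states to a fixed point (re-running the transfer instead of caching, per the note in the patch), and phase 3 emits vsetvlis starting from the now-always-valid Pred. The following schematic driver reuses the illustrative State type from the first sketch; BlockData's shape and the transfer callback are assumptions for illustration, not the pass's exact interfaces.

#include <deque>
#include <vector>

// Reuses the illustrative State from the first sketch above.
struct BlockData {
  State Pred, Change, Exit;
  bool InQueue = false;
};

// Schematic of phases 1 and 2 after this patch: seed every Pred with
// Unknown, take the local transfer as the initial exit state, then iterate
// until no exit state changes. transfer() stands in for
// computeVLVTYPEChanges and is recomputed on every visit.
void solve(std::vector<BlockData> &BI,
           const std::vector<std::vector<int>> &Preds,
           const std::vector<std::vector<int>> &Succs,
           State (*transfer)(int Block, const State &Entry)) {
  std::deque<int> Work;
  for (int B = 0, E = (int)BI.size(); B != E; ++B) {
    BI[B].Pred = State{State::Unknown, 0};
    BI[B].Change = transfer(B, BI[B].Pred);
    BI[B].Exit = BI[B].Change; // initial exit = local change
    BI[B].InQueue = true;
    Work.push_back(B);
  }
  auto Same = [](const State &A, const State &B) {
    return A.K == B.K && A.VTypeBits == B.VTypeBits;
  };
  while (!Work.empty()) {
    int B = Work.front();
    Work.pop_front();
    BI[B].InQueue = false;
    State In = State{State::Unknown, 0}; // no predecessors => Unknown
    if (!Preds[B].empty()) {
      In = State{}; // top; first intersect adopts the predecessor's exit
      for (int P : Preds[B])
        In = In.intersect(BI[P].Exit);
    }
    if (Same(In, BI[B].Pred))
      continue; // entry unchanged, nothing to redo
    BI[B].Pred = In;
    State Out = transfer(B, In); // recompute: output depends on input
    if (Same(Out, BI[B].Exit))
      continue; // exit unchanged, successors unaffected
    BI[B].Exit = Out;
    for (int S : Succs[B])
      if (!BI[S].InQueue) {
        BI[S].InQueue = true;
        Work.push_back(S);
      }
  }
}
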
Index: llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -505,8 +505,8 @@
 ; CHECK-NEXT:    andi a0, a3, 1
 ; CHECK-NEXT:    beqz a0, .LBB9_2
 ; CHECK-NEXT:  # %bb.1: # %if
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vwcvt.x.x.v v8, v10
 ; CHECK-NEXT:  .LBB9_2: # %if.end
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
@@ -544,8 +544,8 @@
 ; CHECK-NEXT:    andi a0, a4, 1
 ; CHECK-NEXT:    beqz a0, .LBB10_2
 ; CHECK-NEXT:  # %bb.1: # %if
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vwadd.wv v9, v9, v10
 ; CHECK-NEXT:  .LBB10_2: # %if.end
 ; CHECK-NEXT:    andi a0, a5, 1
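
The regenerated checks above show the payoff: the vsetvli sinks past the vle16.v because a unit-stride load's data EEW is encoded in the opcode, so the load only needs the incoming vtype to imply a legal EMUL = (EEW/SEW) * LMUL with the same SEW/LMUL ratio; only the following widening op needs the toggle. A worked check of that relationship follows; computeEMULx8 is an illustrative helper, while the pass performs this reasoning inside canSkipVSETVLIForLoadStore.

#include <cassert>
#include <optional>

// For unit-stride and strided memory ops the data EEW comes from the opcode,
// so executing under a "wrong" vtype is fine as long as the implied
// EMUL = (EEW/SEW) * LMUL is legal and the SEW/LMUL ratio (VLMAX) matches.
// LMUL is held as eighths so fractional values (mf8..mf2) stay integral.
std::optional<unsigned> computeEMULx8(unsigned EEW, unsigned SEW,
                                      unsigned LMulX8) {
  if ((EEW * LMulX8) % SEW != 0)
    return std::nullopt;
  unsigned EMulX8 = EEW * LMulX8 / SEW;
  if (EMulX8 < 1 || EMulX8 > 64) // legal groupings: mf8 (1) .. m8 (64)
    return std::nullopt;
  return EMulX8;
}

int main() {
  // From the .ll hunks above: a vle16.v (EEW=16) reached under an e32/m1
  // vtype implies EMUL = (16/32)*1 = 1/2, the same mf2 grouping the removed
  // vsetvli used to establish, so the load can run before the toggle.
  assert(computeEMULx8(/*EEW=*/16, /*SEW=*/32, /*LMulX8=*/8) == 4u); // mf2
  return 0;
}
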
Index: llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -600,10 +600,8 @@
   ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 [[PseudoVID_V_M1_]], [[PHI]], -1, 6 /* e64 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:gpr = MUL [[PHI]], [[SRLI]]
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[MUL]]
-  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 87 /* e32, mf2, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   PseudoVSE32_V_MF2 killed [[PseudoVADD_VX_M1_]], killed [[ADD]], -1, 5 /* e32 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI [[PHI]], 1
-  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   BLTU [[ADDI]], [[COPY1]], %bb.1
   ; CHECK-NEXT:   PseudoBR %bb.2
   ; CHECK-NEXT: {{  $}}
@@ -744,9 +742,6 @@
 ...
 ---
-# FIXME: This test shows incorrect VSETVLI insertion. The VLUXEI64 needs
-# configuration for SEW=8 but it instead inherits a SEW=64 from the entry
-# block.
 name: vsetvli_vluxei64_regression
 tracksRegLiveness: true
 body: |
@@ -779,6 +774,7 @@
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $v0 = COPY %mask
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   early-clobber %t0:vrnov0 = PseudoVLUXEI64_V_M1_MF8_MASK %t5, killed %inaddr, %idxs, $v0, -1, 3 /* e8 */, 1, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   %ldval:vr = COPY %t0
   ; CHECK-NEXT:   PseudoBR %bb.3
@@ -786,6 +782,7 @@
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   %stval:vr = PHI %t4, %bb.1, %ldval, %bb.2
   ; CHECK-NEXT:   $v0 = COPY %mask
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   PseudoVSOXEI64_V_M1_MF8_MASK killed %stval, killed %b, %idxs, $v0, -1, 3 /* e8 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoRET
   bb.0:
@@ -865,8 +862,8 @@
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:gpr = ADD %src, [[PHI]]
-  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   [[PseudoVLE8_V_MF8_:%[0-9]+]]:vrnov0 = PseudoVLE8_V_MF8 killed [[ADD1]], -1, 3 /* e8 */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   [[PseudoVADD_VI_MF8_:%[0-9]+]]:vrnov0 = PseudoVADD_VI_MF8 [[PseudoVLE8_V_MF8_]], 4, -1, 3 /* e8 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[ADD2:%[0-9]+]]:gpr = ADD %dst, [[PHI]]
   ; CHECK-NEXT:   PseudoVSE8_V_MF8 killed [[PseudoVADD_VI_MF8_]], killed [[ADD2]], -1, 3 /* e8 */, implicit $vl, implicit $vtype
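
Finally, the .mir changes resolve the deleted FIXME: the masked vluxei64/vsoxei64 now get an explicit e8/mf8 configuration instead of silently inheriting SEW=64, because the EMUL-based skip is only sound for unit-stride and strided accesses. An indexed access takes its data SEW from vtype (only the index EEW is in the opcode), so it must observe the exact state. A hedged sketch of that gating follows; the enum classification is illustrative, and the pass recognizes the qualifying opcodes internally.

// Which memory ops may skip a vtype toggle. Unit-stride and strided
// accesses encode their data EEW in the opcode, so only EMUL legality and
// the SEW/LMUL ratio matter. Indexed accesses (e.g. PseudoVLUXEI64_*)
// encode the *index* EEW; their data SEW still comes from vtype, so they
// need the exact configuration, which is the bug the deleted FIXME noted.
enum class MemAccessKind { UnitStride, Strided, Indexed, Other };

bool mayUseEMULToSkipVSETVLI(MemAccessKind Kind) {
  switch (Kind) {
  case MemAccessKind::UnitStride:
  case MemAccessKind::Strided:
    return true; // candidate for the EMUL=(EEW/SEW)*LMUL reasoning
  case MemAccessKind::Indexed:
  case MemAccessKind::Other:
    return false; // must observe the exact VL/VTYPE state
  }
  return false;
}
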