diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7526,7 +7526,7 @@
     if (SrcEltTy == SmallestEltTy)
       continue;
     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
-    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
     Src.WindowBase *= Src.WindowScale;
   }
@@ -7578,7 +7578,7 @@
                                             ShuffleOps[1], Mask, DAG);
   if (!Shuffle)
     return SDValue();
-  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
 }
 
 enum ShuffleOpCodes {
@@ -12952,6 +12952,28 @@
   return SDValue();
 }
 
+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
+  if (ST->isLittle())
+    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+    // If the valuetypes are the same, we can remove the cast entirely.
+    if (Op->getOperand(0).getValueType() == VT)
+      return Op->getOperand(0);
+    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformVCMPCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
@@ -14787,6 +14809,8 @@
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
+  case ARMISD::VECTOR_REG_CAST:
+    return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
   case ARMISD::VCMP:
     return PerformVCMPCombine(N, DCI, Subtarget);
   case ARMISD::SMULWB: {
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -294,6 +294,21 @@
 def ARMLE            : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLoopDec       : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
 
+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3990,21 +3990,6 @@
 // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
 def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
 
-// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
-// vector register as a different vector type, without changing the contents of
-// the register. It differs from 'bitconvert' in that bitconvert reinterprets
-// the _memory_ storage format of the vector, whereas vector_reg_cast
-// reinterprets the _register_ format - and in big-endian, the memory and
-// register formats are different, so they are different operations.
-//
-// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
-// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
-// whereas 'bitconvert' will map it to the high byte in big-endian mode,
-// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
-// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
-// no code at all if the vector is already in a register.
-def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
-
 let Predicates = [HasMVEInt] in {
   foreach VT = [ v4i1, v8i1, v16i1 ] in {
     def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -4019,7 +4004,7 @@
 
   foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
     foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
-      def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
+      def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
 }
 
 // end of MVE compares
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -7531,6 +7531,16 @@
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
 }
 
+let Predicates = [HasNEON] in {
+  foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+    foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;
+
+  foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+    foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
 // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
 let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
diff --git a/llvm/test/CodeGen/ARM/neon-vmovn.ll b/llvm/test/CodeGen/ARM/neon-vmovn.ll
--- a/llvm/test/CodeGen/ARM/neon-vmovn.ll
+++ b/llvm/test/CodeGen/ARM/neon-vmovn.ll
@@ -732,8 +732,8 @@
 ;
 ; CHECKBE-LABEL: test:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q8, q1
-; CHECKBE-NEXT:    vrev64.8 q9, q0
+; CHECKBE-NEXT:    vrev64.16 q8, q1
+; CHECKBE-NEXT:    vrev64.16 q9, q0
 ; CHECKBE-NEXT:    vtrn.8 q9, q8
 ; CHECKBE-NEXT:    vrev64.8 q0, q9
 ; CHECKBE-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -10,8 +10,8 @@
 ;
 ; CHECKBE-LABEL: vmovn32_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q1
-; CHECKBE-NEXT:    vrev64.16 q1, q0
+; CHECKBE-NEXT:    vrev64.32 q2, q1
+; CHECKBE-NEXT:    vrev64.32 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i32 q1, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -30,8 +30,8 @@
 ;
 ; CHECKBE-LABEL: vmovn32_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q0
-; CHECKBE-NEXT:    vrev64.16 q3, q1
+; CHECKBE-NEXT:    vrev64.32 q2, q0
+; CHECKBE-NEXT:    vrev64.32 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i32 q3, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q3
 ; CHECKBE-NEXT:    bx lr
@@ -49,8 +49,8 @@
 ;
 ; CHECKBE-LABEL: vmovn16_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q1
-; CHECKBE-NEXT:    vrev64.8 q1, q0
+; CHECKBE-NEXT:    vrev64.16 q2, q1
+; CHECKBE-NEXT:    vrev64.16 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i16 q1, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -69,8 +69,8 @@
 ;
 ; CHECKBE-LABEL: vmovn16_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q0
-; CHECKBE-NEXT:    vrev64.8 q3, q1
+; CHECKBE-NEXT:    vrev64.16 q2, q0
+; CHECKBE-NEXT:    vrev64.16 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i16 q3, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q3
 ; CHECKBE-NEXT:    bx lr
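
For readers unfamiliar with the distinction the new ARMInstrInfo.td comment draws between VECTOR_REG_CAST and bitconvert, the following standalone C++ sketch (not part of the patch, and not LLVM code; the names v16, regcast and bitcastBE are made up for illustration) simulates the two v8i16 -> v16i8 lane mappings the comment describes on a big-endian target:

// Minimal host-side sketch of the semantics described in the comment above.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<std::uint16_t, 8> v16 = {0x0102, 0x0304, 0x0506, 0x0708,
                                      0x090A, 0x0B0C, 0x0D0E, 0x0F10};

  // VECTOR_REG_CAST: byte lane 2*i gets the LSB of i16 lane i, independent of
  // system endianness - the register lanes are simply renumbered.
  std::array<std::uint8_t, 16> regcast;
  for (int i = 0; i < 8; ++i) {
    regcast[2 * i]     = std::uint8_t(v16[i] & 0xFF); // LSB -> even byte lane
    regcast[2 * i + 1] = std::uint8_t(v16[i] >> 8);   // MSB -> odd byte lane
  }

  // bitconvert on big-endian: equivalent to storing each i16 lane in
  // big-endian byte order (VSTRH.16) and reloading the bytes (VLDRB.8), so
  // byte lane 2*i gets the MSB of i16 lane i instead.
  std::array<std::uint8_t, 16> bitcastBE;
  for (int i = 0; i < 8; ++i) {
    bitcastBE[2 * i]     = std::uint8_t(v16[i] >> 8); // MSB stored first
    bitcastBE[2 * i + 1] = std::uint8_t(v16[i] & 0xFF);
  }

  // The two mappings differ by a byte swap within each 16-bit element, which
  // is exactly what VREV16.8 performs - hence bitconvert costs an instruction
  // on big-endian while VECTOR_REG_CAST is free once the value is in a
  // register.
  std::printf("lane0: regcast=%02x bitcastBE=%02x\n", regcast[0], bitcastBE[0]);
  return 0;
}

The test updates above reflect this: with the shuffle lowering now using VECTOR_REG_CAST, the big-endian check lines switch to the VREV64 variant matching the original element size rather than the casted one.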