diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -366,7 +366,8 @@ bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { return MI->getOpcode() == ARM::t2LoopEndDec || - MI->getOpcode() == ARM::t2DoLoopStartTP; + MI->getOpcode() == ARM::t2DoLoopStartTP || + MI->getOpcode() == ARM::t2WhileLoopStartLR; } private: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -52,284 +52,291 @@ namespace ARMISD { // ARM Specific DAG Nodes - enum NodeType : unsigned { - // Start the numbering where the builtin ops and target ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - Wrapper, // Wrapper - A wrapper node for TargetConstantPool, - // TargetExternalSymbol, and TargetGlobalAddress. - WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in - // PIC mode. - WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable - - // Add pseudo op to model memcpy for struct byval. - COPY_STRUCT_BYVAL, - - CALL, // Function call. - CALL_PRED, // Function call that's predicable. - CALL_NOLINK, // Function call with branch not branch-and-link. - tSECALL, // CMSE non-secure function call. - BRCOND, // Conditional branch. - BR_JT, // Jumptable branch. - BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). - RET_FLAG, // Return with a flag operand. - SERET_FLAG, // CMSE Entry function return with a flag operand. - INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. - - PIC_ADD, // Add with a PC operand and a PIC label. - - ASRL, // MVE long arithmetic shift right. - LSRL, // MVE long shift right. - LSLL, // MVE long shift left. - - CMP, // ARM compare instructions. - CMN, // ARM CMN instructions. - CMPZ, // ARM compare that sets only Z flag. - CMPFP, // ARM VFP compare instruction, sets FPSCR. - CMPFPE, // ARM VFP signalling compare instruction, sets FPSCR. - CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. - CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets FPSCR. - FMSTAT, // ARM fmstat instruction. - - CMOV, // ARM conditional move instructions. - SUBS, // Flag-setting subtraction. - - SSAT, // Signed saturation - USAT, // Unsigned saturation - - BCC_i64, - - SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. - SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. - RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. - - ADDC, // Add with carry - ADDE, // Add using carry - SUBC, // Sub with carry - SUBE, // Sub using carry - LSLS, // Shift left producing carry - - VMOVRRD, // double to two gprs. - VMOVDRR, // Two gprs to double. - VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr - - EH_SJLJ_SETJMP, // SjLj exception handling setjmp. - EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. - EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. - - TC_RETURN, // Tail call return pseudo. - - THREAD_POINTER, - - DYN_ALLOC, // Dynamic allocation on the stack. - - MEMBARRIER_MCR, // Memory barrier (MCR) - - PRELOAD, // Preload - - WIN__CHKSTK, // Windows' __chkstk call to do stack probing. - WIN__DBZCHK, // Windows' divide by zero check - - WLS, // Low-overhead loops, While Loop Start branch. See t2WhileLoopStart - WLSSETUP, // Setup for the iteration count of a WLS. See t2WhileLoopSetup. 
- LOOP_DEC, // Really a part of LE, performs the sub - LE, // Low-overhead loops, Loop End - - PREDICATE_CAST, // Predicate cast for MVE i1 types - VECTOR_REG_CAST, // Reinterpret the current contents of a vector register - - VCMP, // Vector compare. - VCMPZ, // Vector compare to zero. - VTST, // Vector test bits. - - // Vector shift by vector - VSHLs, // ...left/right by signed - VSHLu, // ...left/right by unsigned - - // Vector shift by immediate: - VSHLIMM, // ...left - VSHRsIMM, // ...right (signed) - VSHRuIMM, // ...right (unsigned) - - // Vector rounding shift by immediate: - VRSHRsIMM, // ...right (signed) - VRSHRuIMM, // ...right (unsigned) - VRSHRNIMM, // ...right narrow - - // Vector saturating shift by immediate: - VQSHLsIMM, // ...left (signed) - VQSHLuIMM, // ...left (unsigned) - VQSHLsuIMM, // ...left (signed to unsigned) - VQSHRNsIMM, // ...right narrow (signed) - VQSHRNuIMM, // ...right narrow (unsigned) - VQSHRNsuIMM, // ...right narrow (signed to unsigned) - - // Vector saturating rounding shift by immediate: - VQRSHRNsIMM, // ...right narrow (signed) - VQRSHRNuIMM, // ...right narrow (unsigned) - VQRSHRNsuIMM, // ...right narrow (signed to unsigned) - - // Vector shift and insert: - VSLIIMM, // ...left - VSRIIMM, // ...right - - // Vector get lane (VMOV scalar to ARM core register) - // (These are used for 8- and 16-bit element types only.) - VGETLANEu, // zero-extend vector extract element - VGETLANEs, // sign-extend vector extract element - - // Vector move immediate and move negated immediate: - VMOVIMM, - VMVNIMM, - - // Vector move f32 immediate: - VMOVFPIMM, - - // Move H <-> R, clearing top 16 bits - VMOVrh, - VMOVhr, - - // Vector duplicate: - VDUP, - VDUPLANE, - - // Vector shuffles: - VEXT, // extract - VREV64, // reverse elements within 64-bit doublewords - VREV32, // reverse elements within 32-bit words - VREV16, // reverse elements within 16-bit halfwords - VZIP, // zip (interleave) - VUZP, // unzip (deinterleave) - VTRN, // transpose - VTBL1, // 1-register shuffle with mask - VTBL2, // 2-register shuffle with mask - VMOVN, // MVE vmovn - - // MVE Saturating truncates - VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s) - VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u) - - // MVE float <> half converts - VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes - VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes - - // Vector multiply long: - VMULLs, // ...signed - VMULLu, // ...unsigned - - VQDMULH, // MVE vqdmulh instruction - - // MVE reductions - VADDVs, // sign- or zero-extend the elements of a vector to i32, - VADDVu, // add them all together, and return an i32 of their sum - VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask - VADDVpu, - VADDLVs, // sign- or zero-extend elements to i64 and sum, returning - VADDLVu, // the low and high 32-bit halves of the sum - VADDLVAs, // Same as VADDLV[su] but also add an input accumulator - VADDLVAu, // provided as low and high halves - VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask - VADDLVpu, - VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask - VADDLVApu, - VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply them - VMLAVu, // and add the results together, returning an i32 of their sum - VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask - VMLAVpu, - VMLALVs, // Same as VMLAV but with i64, returning the low and - VMLALVu, // high 32-bit halves of the 
sum - VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask - VMLALVpu, - VMLALVAs, // Same as VMLALV but also add an input accumulator - VMLALVAu, // provided as low and high halves - VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask - VMLALVApu, - VMINVu, // Find minimum unsigned value of a vector and register - VMINVs, // Find minimum signed value of a vector and register - VMAXVu, // Find maximum unsigned value of a vector and register - VMAXVs, // Find maximum signed value of a vector and register - - SMULWB, // Signed multiply word by half word, bottom - SMULWT, // Signed multiply word by half word, top - UMLAL, // 64bit Unsigned Accumulate Multiply - SMLAL, // 64bit Signed Accumulate Multiply - UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply - SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16 - SMLALBT, // 64-bit signed accumulate multiply bottom, top 16 - SMLALTB, // 64-bit signed accumulate multiply top, bottom 16 - SMLALTT, // 64-bit signed accumulate multiply top, top 16 - SMLALD, // Signed multiply accumulate long dual - SMLALDX, // Signed multiply accumulate long dual exchange - SMLSLD, // Signed multiply subtract long dual - SMLSLDX, // Signed multiply subtract long dual exchange - SMMLAR, // Signed multiply long, round and add - SMMLSR, // Signed multiply long, subtract and round - - // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b stands for. - QADD8b, - QSUB8b, - QADD16b, - QSUB16b, - - // Operands of the standard BUILD_VECTOR node are not legalized, which - // is fine if BUILD_VECTORs are always lowered to shuffles or other - // operations, but for ARM some BUILD_VECTORs are legal as-is and their - // operands need to be legalized. Define an ARM-specific version of - // BUILD_VECTOR for this purpose. - BUILD_VECTOR, - - // Bit-field insert - BFI, - - // Vector OR with immediate - VORRIMM, - // Vector AND with NOT of immediate - VBICIMM, - - // Pseudo vector bitwise select - VBSP, - - // Pseudo-instruction representing a memory copy using ldm/stm - // instructions. - MEMCPY, - - // V8.1MMainline condition select - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - // Vector load N-element structure to all lanes: - VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, - VLD2DUP, - VLD3DUP, - VLD4DUP, - - // NEON loads with post-increment base updates: - VLD1_UPD, - VLD2_UPD, - VLD3_UPD, - VLD4_UPD, - VLD2LN_UPD, - VLD3LN_UPD, - VLD4LN_UPD, - VLD1DUP_UPD, - VLD2DUP_UPD, - VLD3DUP_UPD, - VLD4DUP_UPD, - - // NEON stores with post-increment base updates: - VST1_UPD, - VST2_UPD, - VST3_UPD, - VST4_UPD, - VST2LN_UPD, - VST3LN_UPD, - VST4LN_UPD, - - // Load/Store of dual registers - LDRD, - STRD - }; + enum NodeType : unsigned { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + Wrapper, // Wrapper - A wrapper node for TargetConstantPool, + // TargetExternalSymbol, and TargetGlobalAddress. + WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in + // PIC mode. + WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable + + // Add pseudo op to model memcpy for struct byval. + COPY_STRUCT_BYVAL, + + CALL, // Function call. + CALL_PRED, // Function call that's predicable. + CALL_NOLINK, // Function call with branch not branch-and-link. + tSECALL, // CMSE non-secure function call. + BRCOND, // Conditional branch. + BR_JT, // Jumptable branch. 
+ BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). + RET_FLAG, // Return with a flag operand. + SERET_FLAG, // CMSE Entry function return with a flag operand. + INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. + + PIC_ADD, // Add with a PC operand and a PIC label. + + ASRL, // MVE long arithmetic shift right. + LSRL, // MVE long shift right. + LSLL, // MVE long shift left. + + CMP, // ARM compare instructions. + CMN, // ARM CMN instructions. + CMPZ, // ARM compare that sets only Z flag. + CMPFP, // ARM VFP compare instruction, sets FPSCR. + CMPFPE, // ARM VFP signalling compare instruction, sets FPSCR. + CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. + CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets + // FPSCR. + FMSTAT, // ARM fmstat instruction. + + CMOV, // ARM conditional move instructions. + SUBS, // Flag-setting subtraction. + + SSAT, // Signed saturation + USAT, // Unsigned saturation + + BCC_i64, + + SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. + SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. + RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. + + ADDC, // Add with carry + ADDE, // Add using carry + SUBC, // Sub with carry + SUBE, // Sub using carry + LSLS, // Shift left producing carry + + VMOVRRD, // double to two gprs. + VMOVDRR, // Two gprs to double. + VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr + + EH_SJLJ_SETJMP, // SjLj exception handling setjmp. + EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. + + TC_RETURN, // Tail call return pseudo. + + THREAD_POINTER, + + DYN_ALLOC, // Dynamic allocation on the stack. + + MEMBARRIER_MCR, // Memory barrier (MCR) + + PRELOAD, // Preload + + WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + WIN__DBZCHK, // Windows' divide by zero check + + WLS, // Low-overhead loops, While Loop Start branch. See t2WhileLoopStart + WLSSETUP, // Setup for the iteration count of a WLS. See t2WhileLoopSetup. + LOOP_DEC, // Really a part of LE, performs the sub + LE, // Low-overhead loops, Loop End + + PREDICATE_CAST, // Predicate cast for MVE i1 types + VECTOR_REG_CAST, // Reinterpret the current contents of a vector register + + VCMP, // Vector compare. + VCMPZ, // Vector compare to zero. + VTST, // Vector test bits. + + // Vector shift by vector + VSHLs, // ...left/right by signed + VSHLu, // ...left/right by unsigned + + // Vector shift by immediate: + VSHLIMM, // ...left + VSHRsIMM, // ...right (signed) + VSHRuIMM, // ...right (unsigned) + + // Vector rounding shift by immediate: + VRSHRsIMM, // ...right (signed) + VRSHRuIMM, // ...right (unsigned) + VRSHRNIMM, // ...right narrow + + // Vector saturating shift by immediate: + VQSHLsIMM, // ...left (signed) + VQSHLuIMM, // ...left (unsigned) + VQSHLsuIMM, // ...left (signed to unsigned) + VQSHRNsIMM, // ...right narrow (signed) + VQSHRNuIMM, // ...right narrow (unsigned) + VQSHRNsuIMM, // ...right narrow (signed to unsigned) + + // Vector saturating rounding shift by immediate: + VQRSHRNsIMM, // ...right narrow (signed) + VQRSHRNuIMM, // ...right narrow (unsigned) + VQRSHRNsuIMM, // ...right narrow (signed to unsigned) + + // Vector shift and insert: + VSLIIMM, // ...left + VSRIIMM, // ...right + + // Vector get lane (VMOV scalar to ARM core register) + // (These are used for 8- and 16-bit element types only.) 
+ VGETLANEu, // zero-extend vector extract element + VGETLANEs, // sign-extend vector extract element + + // Vector move immediate and move negated immediate: + VMOVIMM, + VMVNIMM, + + // Vector move f32 immediate: + VMOVFPIMM, + + // Move H <-> R, clearing top 16 bits + VMOVrh, + VMOVhr, + + // Vector duplicate: + VDUP, + VDUPLANE, + + // Vector shuffles: + VEXT, // extract + VREV64, // reverse elements within 64-bit doublewords + VREV32, // reverse elements within 32-bit words + VREV16, // reverse elements within 16-bit halfwords + VZIP, // zip (interleave) + VUZP, // unzip (deinterleave) + VTRN, // transpose + VTBL1, // 1-register shuffle with mask + VTBL2, // 2-register shuffle with mask + VMOVN, // MVE vmovn + + // MVE Saturating truncates + VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s) + VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u) + + // MVE float <> half converts + VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top + // lanes + VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes + + // Vector multiply long: + VMULLs, // ...signed + VMULLu, // ...unsigned + + VQDMULH, // MVE vqdmulh instruction + + // MVE reductions + VADDVs, // sign- or zero-extend the elements of a vector to i32, + VADDVu, // add them all together, and return an i32 of their sum + VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask + VADDVpu, + VADDLVs, // sign- or zero-extend elements to i64 and sum, returning + VADDLVu, // the low and high 32-bit halves of the sum + VADDLVAs, // Same as VADDLV[su] but also add an input accumulator + VADDLVAu, // provided as low and high halves + VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask + VADDLVpu, + VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask + VADDLVApu, + VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply + // them + VMLAVu, // and add the results together, returning an i32 of their sum + VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask + VMLAVpu, + VMLALVs, // Same as VMLAV but with i64, returning the low and + VMLALVu, // high 32-bit halves of the sum + VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask + VMLALVpu, + VMLALVAs, // Same as VMLALV but also add an input accumulator + VMLALVAu, // provided as low and high halves + VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask + VMLALVApu, + VMINVu, // Find minimum unsigned value of a vector and register + VMINVs, // Find minimum signed value of a vector and register + VMAXVu, // Find maximum unsigned value of a vector and register + VMAXVs, // Find maximum signed value of a vector and register + + SMULWB, // Signed multiply word by half word, bottom + SMULWT, // Signed multiply word by half word, top + UMLAL, // 64bit Unsigned Accumulate Multiply + SMLAL, // 64bit Signed Accumulate Multiply + UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply + SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16 + SMLALBT, // 64-bit signed accumulate multiply bottom, top 16 + SMLALTB, // 64-bit signed accumulate multiply top, bottom 16 + SMLALTT, // 64-bit signed accumulate multiply top, top 16 + SMLALD, // Signed multiply accumulate long dual + SMLALDX, // Signed multiply accumulate long dual exchange + SMLSLD, // Signed multiply subtract long dual + SMLSLDX, // Signed multiply subtract long dual exchange + SMMLAR, // Signed multiply long, round and add + SMMLSR, // Signed multiply long, subtract and round + + // Single Lane 
QADD8 and QADD16. Only the bottom lane. That's what the b + // stands for. + QADD8b, + QSUB8b, + QADD16b, + QSUB16b, + + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. + BUILD_VECTOR, + + // Bit-field insert + BFI, + + // Vector OR with immediate + VORRIMM, + // Vector AND with NOT of immediate + VBICIMM, + + // Pseudo vector bitwise select + VBSP, + + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. + MEMCPY, + // Pseudo-instruction representing a memory copy using a tail predicated + // loop + MEMCPYLOOP, + + // V8.1MMainline condition select + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + + // Vector load N-element structure to all lanes: + VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + VLD2DUP, + VLD3DUP, + VLD4DUP, + + // NEON loads with post-increment base updates: + VLD1_UPD, + VLD2_UPD, + VLD3_UPD, + VLD4_UPD, + VLD2LN_UPD, + VLD3LN_UPD, + VLD4LN_UPD, + VLD1DUP_UPD, + VLD2DUP_UPD, + VLD3DUP_UPD, + VLD4DUP_UPD, + + // NEON stores with post-increment base updates: + VST1_UPD, + VST2_UPD, + VST3_UPD, + VST4_UPD, + VST2LN_UPD, + VST3LN_UPD, + VST4LN_UPD, + + // Load/Store of dual registers + LDRD, + STRD + }; } // end namespace ARMISD diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1813,6 +1813,8 @@ case ARMISD::CSINV: return "ARMISD::CSINV"; case ARMISD::CSNEG: return "ARMISD::CSNEG"; case ARMISD::CSINC: return "ARMISD::CSINC"; + case ARMISD::MEMCPYLOOP: + return "ARMISD::MEMCPYLOOP"; } return nullptr; } @@ -11071,6 +11073,141 @@ return true; } +/// Adds logic in loop entry MBB to calculate loop iteration count and adds +/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop +static Register genTPEntry(MachineBasicBlock *TpEntry, + MachineBasicBlock *TpLoopBody, + MachineBasicBlock *TpExit, Register OpSizeReg, + const TargetInstrInfo *TII, DebugLoc Dl, + MachineRegisterInfo &MRI) { + + // Calculates loop iteration count = ceil(n/16)/16 = ((n + 15)&(-16)) / 16. + Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) + .addUse(OpSizeReg) + .addImm(15) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg) + .addUse(AddDestReg, RegState::Kill) + .addImm(16) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register LsrDestReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) + .addUse(BicDestReg, RegState::Kill) + .addImm(4) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) + .addUse(LsrDestReg, RegState::Kill); + + BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) + .addUse(TotalIterationsReg) + .addMBB(TpExit); + + return TotalIterationsReg; +} + +/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and +/// t2DoLoopEnd. 
These are used by later passes to generate tail predicated +/// loops. +static void genTPLoopBody(MachineBasicBlock *TpLoopBody, + MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, + const TargetInstrInfo *TII, DebugLoc Dl, + MachineRegisterInfo &MRI, Register OpSrcReg, + Register OpDestReg, Register ElementCountReg, + Register TotalIterationsReg) { + + // First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop + // iteration counter, predication counter Current position in the src array + Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) + .addUse(OpSrcReg) + .addMBB(TpEntry) + .addUse(CurrSrcReg) + .addMBB(TpLoopBody); + + // Current position in the dest array + Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) + .addUse(OpDestReg) + .addMBB(TpEntry) + .addUse(CurrDestReg) + .addMBB(TpLoopBody); + + // Current loop counter + Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + Register RemainingLoopIterationsReg = + MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) + .addUse(TotalIterationsReg) + .addMBB(TpEntry) + .addUse(RemainingLoopIterationsReg) + .addMBB(TpLoopBody); + + // Predication counter + Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) + .addUse(ElementCountReg) + .addMBB(TpEntry) + .addUse(RemainingElementsReg) + .addMBB(TpLoopBody); + + // Pass predication counter to VCTP + Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) + .addUse(PredCounterPhiReg) + .addImm(ARMVCC::None) + .addReg(0); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) + .addUse(PredCounterPhiReg) + .addImm(16) + .add(predOps(ARMCC::AL)) + .addReg(0); + + // VLDRB and VSTRB instructions, predicated using VPR + Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) + .addDef(CurrSrcReg) + .addDef(LoadedValueReg) + .addReg(SrcPhiReg) + .addImm(16) + .addImm(ARMVCC::Then) + .addUse(VccrReg); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) + .addDef(CurrDestReg) + .addUse(LoadedValueReg, RegState::Kill) + .addReg(DestPhiReg) + .addImm(16) + .addImm(ARMVCC::Then) + .addUse(VccrReg); + + // Add the pseudoInstrs for decrementing the loop counter and marking the + // end:t2DoLoopDec and t2DoLoopEnd + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) + .addUse(LoopCounterPhiReg) + .addImm(1); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) + .addUse(RemainingLoopIterationsReg) + .addMBB(TpLoopBody); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) + .addMBB(TpExit) + .add(predOps(ARMCC::AL)); +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -11097,6 +11234,82 @@ return BB; } + case ARM::MVE_MEMCPYLOOPINST: { + + // Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction + // into a Tail Predicated (TP) Loop. 
It adds the instructions to calculate + // the iteration count (= ceil(size_in_bytes/16)) in the TP entry block and + // adds the relevant instructions in the TP loop Body for generation of a + // WLSTP loop. + + // Below is the relevant portion of the CFG after the transformation. + // The Machine Basic Blocks are shown along with branch conditions (in + // brackets). Note that TP entry/exit MBBs depict the entry/exit of this + // portion of the CFG and may not necessarily be the entry/exit of the + // function. + + // (Relevant) CFG after transformation: + // TP entry MBB + // | + // |-----------------| + // (n <= 0) (n > 0) + // | | + // | TP loop Body MBB + // \ | + // \ / + // TP exit MBB + + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + Register OpDestReg = MI.getOperand(0).getReg(); + Register OpSrcReg = MI.getOperand(1).getReg(); + Register OpSizeReg = MI.getOperand(2).getReg(); + + // Allocate the required MBBs and add to parent function. + MachineBasicBlock *TpEntry = BB; + MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock(); + MachineBasicBlock *TpExit; + + MF->push_back(TpLoopBody); + + // If any instructions are present in the current block after + // MVE_MEMCPYLOOPINST, move them into the exit block. This is required since + // a terminator (t2WhileLoopStart) will be placed at that site. If no + // instructions are present after MVE_MEMCPYLOOPINST, then fallthrough is + // the exit. + TpExit = BB->splitAt(MI, false); + if (TpExit == BB) { + assert(BB->canFallThrough() && + "exit Block must be Fallthrough of the block containing memcpy"); + TpExit = BB->getFallThrough(); + } + + // Add logic for iteration count + Register TotalIterationsReg = + genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI); + + // Add the vectorized (and predicated) load/store instructions + genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg, + OpDestReg, OpSizeReg, TotalIterationsReg); + + // Connect the blocks + TpEntry->addSuccessor(TpLoopBody); + TpLoopBody->addSuccessor(TpLoopBody); + TpLoopBody->addSuccessor(TpExit); + + // Reorder for a more natural layout + TpLoopBody->moveAfter(TpEntry); + TpExit->moveAfter(TpLoopBody); + + // Finally, remove the memcpy Pseudo Instruction + MI.eraseFromParent(); + + // Return the exit block as it may contain other instructions requiring a + // custom inserter + return TpExit; + } + // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos.
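For reference, the net effect of the MVE_MEMCPYLOOPINST expansion above (genTPEntry computing ceil(n/16) iterations, genTPLoopBody emitting the VCTP8-predicated VLDRB/VSTRB pair) behaves like the following scalar model. This is only an illustrative sketch; the function and variable names here are made up and do not appear in the patch.

// Scalar model of the expanded tail-predicated memcpy (illustrative only).
#include <cstdint>

static void memcpyTPModel(uint8_t *Dst, const uint8_t *Src, uint32_t N) {
  uint32_t TotalIterations = (N + 15) / 16; // genTPEntry: t2ADDri 15, mask, t2LSRri 4
  uint32_t Remaining = N;                   // predication counter fed to MVE_VCTP8
  for (uint32_t I = 0; I < TotalIterations; ++I) {
    uint32_t Lanes = Remaining < 16 ? Remaining : 16; // lanes enabled by the VCTP8 mask
    for (uint32_t L = 0; L < Lanes; ++L)              // predicated VLDRBU8/VSTRBU8 pair
      Dst[I * 16 + L] = Src[I * 16 + L];
    Remaining -= 16; // t2SUBri by 16 feeding the loop-body PHI (unused after the last iteration)
  }
}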
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -6864,6 +6864,18 @@ let isTerminator = 1; } +def SDT_MVEMEMCPYLOOPNODE + : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; +def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def MVE_MEMCPYLOOPINST : PseudoInst<(outs), + (ins rGPR:$dst, rGPR:$src, rGPR:$sz), + NoItinerary, + [(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>; +} + def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>; def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>; def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>; diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -16,6 +16,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/Support/CommandLine.h" namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -17,6 +17,11 @@ #define DEBUG_TYPE "arm-selectiondag-info" +static cl::opt<cl::boolOrDefault> + EnableMemcpyTPLoop("arm-memcpy-tploop", cl::Hidden, + cl::desc("Enable/disable conversion of llvm.memcpy to " + "Tail predicated loops (WLSTP)")); + // Emit, if possible, a specialized version of the given Libcall. Typically this // means selecting the appropriately aligned version, but we also convert memset // of 0 into memclr. @@ -130,13 +135,31 @@ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { const ARMSubtarget &Subtarget = DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + + auto GenInlineTP = [=](const ARMSubtarget &Subtarget, + const SelectionDAG &DAG) { + return Subtarget.hasMVEIntegerOps() && + !DAG.getMachineFunction().getFunction().hasOptNone() && + ((!ConstantSize && (Alignment >= Align(4))) || + (ConstantSize && + ConstantSize->getZExtValue() > + Subtarget.getMaxInlineSizeThreshold() && + ConstantSize->getZExtValue() < + Subtarget.getMaxTPLoopInlineSizeThreshold())); + }; + + if ((EnableMemcpyTPLoop == cl::BOU_TRUE) || + (EnableMemcpyTPLoop == cl::BOU_UNSET && GenInlineTP(Subtarget, DAG))) + return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src, + Size); + // Do repeated 4-byte loads and stores. To be improved. // This requires 4-byte alignment. if (Alignment < Align(4)) return SDValue(); // This requires the copy size to be a constant, preferably // within a subtarget-specific limit.
- ConstantSDNode *ConstantSize = dyn_cast(Size); if (!ConstantSize) return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Alignment.value(), RTLIB::MEMCPY); diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -538,6 +538,11 @@ return 64; } + /// getMaxTPLoopSizeThreshold - Returns the maximum memcpy size + /// that still makes it profitable to inline the call as a Tail + /// Predicated loop + unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; } + /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll @@ -4,31 +4,36 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) { ; CHECK-LABEL: test_memcpy: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: blt .LBB0_3 +; CHECK-NEXT: blt .LBB0_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: lsls r4, r3, #2 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: lsl.w r12, r3, #2 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_2: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r0, r7, r6 -; CHECK-NEXT: add.w r1, r9, r6 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: bl __aeabi_memcpy4 -; CHECK-NEXT: add r6, r4 -; CHECK-NEXT: subs r5, #1 -; CHECK-NEXT: bne .LBB0_2 -; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB0_4 Depth 2 +; CHECK-NEXT: adds r4, r1, r7 +; CHECK-NEXT: adds r5, r0, r7 +; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: wlstp.8 lr, r6, .LBB0_3 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_3: @ %for.body +; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: add r7, r12 +; CHECK-NEXT: subs r2, #1 +; CHECK-NEXT: beq .LBB0_5 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_4: @ Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrb.u8 q0, [r4], #16 +; CHECK-NEXT: vstrb.8 q0, [r5], #16 +; CHECK-NEXT: letp lr, .LBB0_4 +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %for.body, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll b/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O1 -mtriple=arm-arm-none-eabi -mcpu=cortex-m55 --verify-machineinstrs %s -o - | FileCheck %s + +; Check that WLSTP loop is not generated for alignment < 4 +; void test1(char* dest, char* src, int n){ +; 
memcpy(dest, src, n); +; } + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1 + +define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){ +; CHECK-LABEL: test1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl __aeabi_memcpy +; CHECK-NEXT: pop {r7, pc} +entry: + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %X, i8* align 1 %Y, i32 %n, i1 false) + ret void +} + + +; Check that WLSTP loop is generated for alignment >= 4 +; void test2(int* restrict X, int* restrict Y, int n){ +; memcpy(X, Y, n); +; } + + +define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n){ +; CHECK-LABEL: test2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: wlstp.8 lr, r2, .LBB1_2 +; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: .LBB1_2: @ %entry +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = bitcast i32* %X to i8* + %1 = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false) + ret void +} + + +; Checks that transform handles some arithmetic on the input arguments. +; void test3(int* restrict X, int* restrict Y, int n) +; { +; memcpy(X+2, Y+3, (n*2)+10); +; } + +define void @test3(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) { +; CHECK-LABEL: test3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #10 +; CHECK-NEXT: add.w r2, r3, r2, lsl #1 +; CHECK-NEXT: adds r0, #8 +; CHECK-NEXT: adds r1, #12 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB2_2 +; CHECK-NEXT: .LBB2_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: .LBB2_2: @ %entry +; CHECK-NEXT: pop {r7, pc} +entry: + %add.ptr = getelementptr inbounds i32, i32* %X, i32 2 + %0 = bitcast i32* %add.ptr to i8* + %add.ptr1 = getelementptr inbounds i32, i32* %Y, i32 3 + %1 = bitcast i32* %add.ptr1 to i8* + %mul = shl nsw i32 %n, 1 + %add = add nsw i32 %mul, 10 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 %0, i8* nonnull align 4 %1, i32 %add, i1 false) + ret void +} + + +; Checks that transform handles for loops that are implicitly converted to mempcy +; void test4(int* restrict X, int* restrict Y, int n){ +; for(int i = 0; i < n; ++i){ +; X[i] = Y[i]; +; } +; } + +define void @test4(i32* noalias %X, i32* noalias readonly %Y, i32 %n) { +; CHECK-LABEL: test4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB3_1: @ %for.body.preheader +; CHECK-NEXT: wlstp.8 lr, r2, .LBB3_3 +; CHECK-NEXT: .LBB3_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %X.bits = bitcast i32* %X to i8* + %Y.bits = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false) + br label 
%for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body.preheader, %entry + ret void +} + diff --git a/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir b/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O1 -mtriple=arm-arm-none-eabi -mcpu=cortex-m55 -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s +--- | + ; ModuleID = 'llvm/test/CodeGen/Thumb2/mve_tp_loop.ll' + source_filename = "llvm/test/CodeGen/Thumb2/mve_tp_loop.ll" + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "arm-arm-none-eabi" + + ; Function Attrs: argmemonly nofree nosync nounwind willreturn + declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #0 + + define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) #1 { + entry: + %0 = bitcast i32* %X to i8* + %1 = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false) + ret void + } + + define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) #1 { + entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %X.bits = bitcast i32* %X to i8* + %Y.bits = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %for.body.preheader, %entry + ret void + } + + attributes #0 = { argmemonly nofree nosync nounwind willreturn "target-cpu"="cortex-m55" } + attributes #1 = { "target-cpu"="cortex-m55" } + +... 
+--- +name: test1 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r0, $r1, $r2 + + ; CHECK-LABEL: name: test1 + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr + ; CHECK: .1: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: .2.entry: + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + %2:rgpr = COPY $r2 + %1:rgpr = COPY $r1 + %0:rgpr = COPY $r0 + MVE_MEMCPYLOOPINST %0, %1, %2 + tBX_RET 14 /* CC::al */, $noreg + +... 
+--- +name: test2 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test2 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg + ; CHECK: bb.1.for.body.preheader: + ; CHECK: successors: %bb.2(0x80000000), %bb.3(0x00000000) + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr + ; CHECK: bb.3: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3 + ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3 + ; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3 + ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $r0, $r1, $r2 + + %2:rgpr = COPY $r2 + %1:rgpr = COPY $r1 + %0:rgpr = COPY $r0 + t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + t2B %bb.1, 14 /* CC::al */, $noreg + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + MVE_MEMCPYLOOPINST %0, %1, %2 + + bb.2.for.cond.cleanup: + tBX_RET 14 /* CC::al */, $noreg + +...
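A closing note on the SelectionDAG gating in ARMSelectionDAGInfo.cpp: the GenInlineTP lambda only fires when MVE integer ops are available, the function is not optnone, and either the size is a runtime value with at least 4-byte alignment, or the constant size falls between getMaxInlineSizeThreshold() and the new getMaxTPLoopInlineSizeThreshold() (64 and 128 bytes with the defaults in ARMSubtarget.h). Restated below as a free function for readability; this is a sketch with invented parameter names, not code from the patch.

// Sketch of the profitability check; not part of the patch.
#include <cstdint>

static bool wouldEmitMemcpyTPLoop(bool HasMVEIntegerOps, bool IsOptNone,
                                  uint64_t AlignInBytes, bool SizeIsConstant,
                                  uint64_t ConstSize, uint64_t MaxInlineSize,
                                  uint64_t MaxTPLoopInlineSize) {
  if (!HasMVEIntegerOps || IsOptNone)
    return false;
  if (!SizeIsConstant)
    return AlignInBytes >= 4; // runtime size: require word alignment
  // Constant sizes: only those too large for the plain inline expansion but
  // still below the TP-loop threshold.
  return ConstSize > MaxInlineSize && ConstSize < MaxTPLoopInlineSize;
}

The hidden arm-memcpy-tploop option layered on top of this check forces the TP loop when set to true, suppresses it when set to false, and defers to the heuristic when left unset.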