diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -366,7 +366,8 @@ bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { return MI->getOpcode() == ARM::t2LoopEndDec || - MI->getOpcode() == ARM::t2DoLoopStartTP; + MI->getOpcode() == ARM::t2DoLoopStartTP || + MI->getOpcode() == ARM::t2WhileLoopStartLR; } private: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -52,284 +52,291 @@ namespace ARMISD { // ARM Specific DAG Nodes - enum NodeType : unsigned { - // Start the numbering where the builtin ops and target ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - Wrapper, // Wrapper - A wrapper node for TargetConstantPool, - // TargetExternalSymbol, and TargetGlobalAddress. - WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in - // PIC mode. - WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable - - // Add pseudo op to model memcpy for struct byval. - COPY_STRUCT_BYVAL, - - CALL, // Function call. - CALL_PRED, // Function call that's predicable. - CALL_NOLINK, // Function call with branch not branch-and-link. - tSECALL, // CMSE non-secure function call. - BRCOND, // Conditional branch. - BR_JT, // Jumptable branch. - BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). - RET_FLAG, // Return with a flag operand. - SERET_FLAG, // CMSE Entry function return with a flag operand. - INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. - - PIC_ADD, // Add with a PC operand and a PIC label. - - ASRL, // MVE long arithmetic shift right. - LSRL, // MVE long shift right. - LSLL, // MVE long shift left. - - CMP, // ARM compare instructions. - CMN, // ARM CMN instructions. - CMPZ, // ARM compare that sets only Z flag. - CMPFP, // ARM VFP compare instruction, sets FPSCR. - CMPFPE, // ARM VFP signalling compare instruction, sets FPSCR. - CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. - CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets FPSCR. - FMSTAT, // ARM fmstat instruction. - - CMOV, // ARM conditional move instructions. - SUBS, // Flag-setting subtraction. - - SSAT, // Signed saturation - USAT, // Unsigned saturation - - BCC_i64, - - SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. - SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. - RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. - - ADDC, // Add with carry - ADDE, // Add using carry - SUBC, // Sub with carry - SUBE, // Sub using carry - LSLS, // Shift left producing carry - - VMOVRRD, // double to two gprs. - VMOVDRR, // Two gprs to double. - VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr - - EH_SJLJ_SETJMP, // SjLj exception handling setjmp. - EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. - EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. - - TC_RETURN, // Tail call return pseudo. - - THREAD_POINTER, - - DYN_ALLOC, // Dynamic allocation on the stack. - - MEMBARRIER_MCR, // Memory barrier (MCR) - - PRELOAD, // Preload - - WIN__CHKSTK, // Windows' __chkstk call to do stack probing. - WIN__DBZCHK, // Windows' divide by zero check - - WLS, // Low-overhead loops, While Loop Start branch. See t2WhileLoopStart - WLSSETUP, // Setup for the iteration count of a WLS. See t2WhileLoopSetup. 
- LOOP_DEC, // Really a part of LE, performs the sub - LE, // Low-overhead loops, Loop End - - PREDICATE_CAST, // Predicate cast for MVE i1 types - VECTOR_REG_CAST, // Reinterpret the current contents of a vector register - - VCMP, // Vector compare. - VCMPZ, // Vector compare to zero. - VTST, // Vector test bits. - - // Vector shift by vector - VSHLs, // ...left/right by signed - VSHLu, // ...left/right by unsigned - - // Vector shift by immediate: - VSHLIMM, // ...left - VSHRsIMM, // ...right (signed) - VSHRuIMM, // ...right (unsigned) - - // Vector rounding shift by immediate: - VRSHRsIMM, // ...right (signed) - VRSHRuIMM, // ...right (unsigned) - VRSHRNIMM, // ...right narrow - - // Vector saturating shift by immediate: - VQSHLsIMM, // ...left (signed) - VQSHLuIMM, // ...left (unsigned) - VQSHLsuIMM, // ...left (signed to unsigned) - VQSHRNsIMM, // ...right narrow (signed) - VQSHRNuIMM, // ...right narrow (unsigned) - VQSHRNsuIMM, // ...right narrow (signed to unsigned) - - // Vector saturating rounding shift by immediate: - VQRSHRNsIMM, // ...right narrow (signed) - VQRSHRNuIMM, // ...right narrow (unsigned) - VQRSHRNsuIMM, // ...right narrow (signed to unsigned) - - // Vector shift and insert: - VSLIIMM, // ...left - VSRIIMM, // ...right - - // Vector get lane (VMOV scalar to ARM core register) - // (These are used for 8- and 16-bit element types only.) - VGETLANEu, // zero-extend vector extract element - VGETLANEs, // sign-extend vector extract element - - // Vector move immediate and move negated immediate: - VMOVIMM, - VMVNIMM, - - // Vector move f32 immediate: - VMOVFPIMM, - - // Move H <-> R, clearing top 16 bits - VMOVrh, - VMOVhr, - - // Vector duplicate: - VDUP, - VDUPLANE, - - // Vector shuffles: - VEXT, // extract - VREV64, // reverse elements within 64-bit doublewords - VREV32, // reverse elements within 32-bit words - VREV16, // reverse elements within 16-bit halfwords - VZIP, // zip (interleave) - VUZP, // unzip (deinterleave) - VTRN, // transpose - VTBL1, // 1-register shuffle with mask - VTBL2, // 2-register shuffle with mask - VMOVN, // MVE vmovn - - // MVE Saturating truncates - VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s) - VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u) - - // MVE float <> half converts - VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes - VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes - - // Vector multiply long: - VMULLs, // ...signed - VMULLu, // ...unsigned - - VQDMULH, // MVE vqdmulh instruction - - // MVE reductions - VADDVs, // sign- or zero-extend the elements of a vector to i32, - VADDVu, // add them all together, and return an i32 of their sum - VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask - VADDVpu, - VADDLVs, // sign- or zero-extend elements to i64 and sum, returning - VADDLVu, // the low and high 32-bit halves of the sum - VADDLVAs, // Same as VADDLV[su] but also add an input accumulator - VADDLVAu, // provided as low and high halves - VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask - VADDLVpu, - VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask - VADDLVApu, - VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply them - VMLAVu, // and add the results together, returning an i32 of their sum - VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask - VMLAVpu, - VMLALVs, // Same as VMLAV but with i64, returning the low and - VMLALVu, // high 32-bit halves of the 
sum - VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask - VMLALVpu, - VMLALVAs, // Same as VMLALV but also add an input accumulator - VMLALVAu, // provided as low and high halves - VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask - VMLALVApu, - VMINVu, // Find minimum unsigned value of a vector and register - VMINVs, // Find minimum signed value of a vector and register - VMAXVu, // Find maximum unsigned value of a vector and register - VMAXVs, // Find maximum signed value of a vector and register - - SMULWB, // Signed multiply word by half word, bottom - SMULWT, // Signed multiply word by half word, top - UMLAL, // 64bit Unsigned Accumulate Multiply - SMLAL, // 64bit Signed Accumulate Multiply - UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply - SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16 - SMLALBT, // 64-bit signed accumulate multiply bottom, top 16 - SMLALTB, // 64-bit signed accumulate multiply top, bottom 16 - SMLALTT, // 64-bit signed accumulate multiply top, top 16 - SMLALD, // Signed multiply accumulate long dual - SMLALDX, // Signed multiply accumulate long dual exchange - SMLSLD, // Signed multiply subtract long dual - SMLSLDX, // Signed multiply subtract long dual exchange - SMMLAR, // Signed multiply long, round and add - SMMLSR, // Signed multiply long, subtract and round - - // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b stands for. - QADD8b, - QSUB8b, - QADD16b, - QSUB16b, - - // Operands of the standard BUILD_VECTOR node are not legalized, which - // is fine if BUILD_VECTORs are always lowered to shuffles or other - // operations, but for ARM some BUILD_VECTORs are legal as-is and their - // operands need to be legalized. Define an ARM-specific version of - // BUILD_VECTOR for this purpose. - BUILD_VECTOR, - - // Bit-field insert - BFI, - - // Vector OR with immediate - VORRIMM, - // Vector AND with NOT of immediate - VBICIMM, - - // Pseudo vector bitwise select - VBSP, - - // Pseudo-instruction representing a memory copy using ldm/stm - // instructions. - MEMCPY, - - // V8.1MMainline condition select - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - // Vector load N-element structure to all lanes: - VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, - VLD2DUP, - VLD3DUP, - VLD4DUP, - - // NEON loads with post-increment base updates: - VLD1_UPD, - VLD2_UPD, - VLD3_UPD, - VLD4_UPD, - VLD2LN_UPD, - VLD3LN_UPD, - VLD4LN_UPD, - VLD1DUP_UPD, - VLD2DUP_UPD, - VLD3DUP_UPD, - VLD4DUP_UPD, - - // NEON stores with post-increment base updates: - VST1_UPD, - VST2_UPD, - VST3_UPD, - VST4_UPD, - VST2LN_UPD, - VST3LN_UPD, - VST4LN_UPD, - - // Load/Store of dual registers - LDRD, - STRD - }; + enum NodeType : unsigned { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + Wrapper, // Wrapper - A wrapper node for TargetConstantPool, + // TargetExternalSymbol, and TargetGlobalAddress. + WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in + // PIC mode. + WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable + + // Add pseudo op to model memcpy for struct byval. + COPY_STRUCT_BYVAL, + + CALL, // Function call. + CALL_PRED, // Function call that's predicable. + CALL_NOLINK, // Function call with branch not branch-and-link. + tSECALL, // CMSE non-secure function call. + BRCOND, // Conditional branch. + BR_JT, // Jumptable branch. 
+ BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). + RET_FLAG, // Return with a flag operand. + SERET_FLAG, // CMSE Entry function return with a flag operand. + INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. + + PIC_ADD, // Add with a PC operand and a PIC label. + + ASRL, // MVE long arithmetic shift right. + LSRL, // MVE long shift right. + LSLL, // MVE long shift left. + + CMP, // ARM compare instructions. + CMN, // ARM CMN instructions. + CMPZ, // ARM compare that sets only Z flag. + CMPFP, // ARM VFP compare instruction, sets FPSCR. + CMPFPE, // ARM VFP signalling compare instruction, sets FPSCR. + CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. + CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets + // FPSCR. + FMSTAT, // ARM fmstat instruction. + + CMOV, // ARM conditional move instructions. + SUBS, // Flag-setting subtraction. + + SSAT, // Signed saturation + USAT, // Unsigned saturation + + BCC_i64, + + SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. + SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. + RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. + + ADDC, // Add with carry + ADDE, // Add using carry + SUBC, // Sub with carry + SUBE, // Sub using carry + LSLS, // Shift left producing carry + + VMOVRRD, // double to two gprs. + VMOVDRR, // Two gprs to double. + VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr + + EH_SJLJ_SETJMP, // SjLj exception handling setjmp. + EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. + + TC_RETURN, // Tail call return pseudo. + + THREAD_POINTER, + + DYN_ALLOC, // Dynamic allocation on the stack. + + MEMBARRIER_MCR, // Memory barrier (MCR) + + PRELOAD, // Preload + + WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + WIN__DBZCHK, // Windows' divide by zero check + + WLS, // Low-overhead loops, While Loop Start branch. See t2WhileLoopStart + WLSSETUP, // Setup for the iteration count of a WLS. See t2WhileLoopSetup. + LOOP_DEC, // Really a part of LE, performs the sub + LE, // Low-overhead loops, Loop End + + PREDICATE_CAST, // Predicate cast for MVE i1 types + VECTOR_REG_CAST, // Reinterpret the current contents of a vector register + + VCMP, // Vector compare. + VCMPZ, // Vector compare to zero. + VTST, // Vector test bits. + + // Vector shift by vector + VSHLs, // ...left/right by signed + VSHLu, // ...left/right by unsigned + + // Vector shift by immediate: + VSHLIMM, // ...left + VSHRsIMM, // ...right (signed) + VSHRuIMM, // ...right (unsigned) + + // Vector rounding shift by immediate: + VRSHRsIMM, // ...right (signed) + VRSHRuIMM, // ...right (unsigned) + VRSHRNIMM, // ...right narrow + + // Vector saturating shift by immediate: + VQSHLsIMM, // ...left (signed) + VQSHLuIMM, // ...left (unsigned) + VQSHLsuIMM, // ...left (signed to unsigned) + VQSHRNsIMM, // ...right narrow (signed) + VQSHRNuIMM, // ...right narrow (unsigned) + VQSHRNsuIMM, // ...right narrow (signed to unsigned) + + // Vector saturating rounding shift by immediate: + VQRSHRNsIMM, // ...right narrow (signed) + VQRSHRNuIMM, // ...right narrow (unsigned) + VQRSHRNsuIMM, // ...right narrow (signed to unsigned) + + // Vector shift and insert: + VSLIIMM, // ...left + VSRIIMM, // ...right + + // Vector get lane (VMOV scalar to ARM core register) + // (These are used for 8- and 16-bit element types only.) 
+ VGETLANEu, // zero-extend vector extract element + VGETLANEs, // sign-extend vector extract element + + // Vector move immediate and move negated immediate: + VMOVIMM, + VMVNIMM, + + // Vector move f32 immediate: + VMOVFPIMM, + + // Move H <-> R, clearing top 16 bits + VMOVrh, + VMOVhr, + + // Vector duplicate: + VDUP, + VDUPLANE, + + // Vector shuffles: + VEXT, // extract + VREV64, // reverse elements within 64-bit doublewords + VREV32, // reverse elements within 32-bit words + VREV16, // reverse elements within 16-bit halfwords + VZIP, // zip (interleave) + VUZP, // unzip (deinterleave) + VTRN, // transpose + VTBL1, // 1-register shuffle with mask + VTBL2, // 2-register shuffle with mask + VMOVN, // MVE vmovn + + // MVE Saturating truncates + VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s) + VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u) + + // MVE float <> half converts + VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top + // lanes + VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes + + // Vector multiply long: + VMULLs, // ...signed + VMULLu, // ...unsigned + + VQDMULH, // MVE vqdmulh instruction + + // MVE reductions + VADDVs, // sign- or zero-extend the elements of a vector to i32, + VADDVu, // add them all together, and return an i32 of their sum + VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask + VADDVpu, + VADDLVs, // sign- or zero-extend elements to i64 and sum, returning + VADDLVu, // the low and high 32-bit halves of the sum + VADDLVAs, // Same as VADDLV[su] but also add an input accumulator + VADDLVAu, // provided as low and high halves + VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask + VADDLVpu, + VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask + VADDLVApu, + VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply + // them + VMLAVu, // and add the results together, returning an i32 of their sum + VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask + VMLAVpu, + VMLALVs, // Same as VMLAV but with i64, returning the low and + VMLALVu, // high 32-bit halves of the sum + VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask + VMLALVpu, + VMLALVAs, // Same as VMLALV but also add an input accumulator + VMLALVAu, // provided as low and high halves + VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask + VMLALVApu, + VMINVu, // Find minimum unsigned value of a vector and register + VMINVs, // Find minimum signed value of a vector and register + VMAXVu, // Find maximum unsigned value of a vector and register + VMAXVs, // Find maximum signed value of a vector and register + + SMULWB, // Signed multiply word by half word, bottom + SMULWT, // Signed multiply word by half word, top + UMLAL, // 64bit Unsigned Accumulate Multiply + SMLAL, // 64bit Signed Accumulate Multiply + UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply + SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16 + SMLALBT, // 64-bit signed accumulate multiply bottom, top 16 + SMLALTB, // 64-bit signed accumulate multiply top, bottom 16 + SMLALTT, // 64-bit signed accumulate multiply top, top 16 + SMLALD, // Signed multiply accumulate long dual + SMLALDX, // Signed multiply accumulate long dual exchange + SMLSLD, // Signed multiply subtract long dual + SMLSLDX, // Signed multiply subtract long dual exchange + SMMLAR, // Signed multiply long, round and add + SMMLSR, // Signed multiply long, subtract and round + + // Single Lane 
QADD8 and QADD16. Only the bottom lane. That's what the b + // stands for. + QADD8b, + QSUB8b, + QADD16b, + QSUB16b, + + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. + BUILD_VECTOR, + + // Bit-field insert + BFI, + + // Vector OR with immediate + VORRIMM, + // Vector AND with NOT of immediate + VBICIMM, + + // Pseudo vector bitwise select + VBSP, + + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. + MEMCPY, + // Pseudo-instruction representing a memory copy using a tail predicated + // loop + MEMCPYLOOP, + + // V8.1MMainline condition select + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + + // Vector load N-element structure to all lanes: + VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + VLD2DUP, + VLD3DUP, + VLD4DUP, + + // NEON loads with post-increment base updates: + VLD1_UPD, + VLD2_UPD, + VLD3_UPD, + VLD4_UPD, + VLD2LN_UPD, + VLD3LN_UPD, + VLD4LN_UPD, + VLD1DUP_UPD, + VLD2DUP_UPD, + VLD3DUP_UPD, + VLD4DUP_UPD, + + // NEON stores with post-increment base updates: + VST1_UPD, + VST2_UPD, + VST3_UPD, + VST4_UPD, + VST2LN_UPD, + VST3LN_UPD, + VST4LN_UPD, + + // Load/Store of dual registers + LDRD, + STRD + }; } // end namespace ARMISD diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1813,6 +1813,8 @@ case ARMISD::CSINV: return "ARMISD::CSINV"; case ARMISD::CSNEG: return "ARMISD::CSNEG"; case ARMISD::CSINC: return "ARMISD::CSINC"; + case ARMISD::MEMCPYLOOP: + return "ARMISD::MEMCPYLOOP"; } return nullptr; } @@ -11071,6 +11073,141 @@ return true; } +/// Adds logic in loop entry MBB to calculate loop iteration count and adds +/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop +static Register genTPEntry(MachineBasicBlock *TpEntry, + MachineBasicBlock *TpLoopBody, + MachineBasicBlock *TpExit, Register OpSizeReg, + const TargetInstrInfo *TII, DebugLoc Dl, + MachineRegisterInfo &MRI) { + + // Calculates loop iteration count = ceil(n/16)/16 = ((n + 15)&(-16)) / 16. + Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) + .addUse(OpSizeReg) + .addImm(15) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg) + .addUse(AddDestReg, RegState::Kill) + .addImm(16) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register LsrDestReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) + .addUse(BicDestReg, RegState::Kill) + .addImm(4) + .add(predOps(ARMCC::AL)) + .addReg(0); + + Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) + .addUse(LsrDestReg, RegState::Kill); + + BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) + .addUse(TotalIterationsReg) + .addMBB(TpExit); + + return TotalIterationsReg; +} + +/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and +/// t2DoLoopEnd. 
These are used by later passes to generate tail predicated +/// loops. +static void genTPLoopBody(MachineBasicBlock *TpLoopBody, + MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, + const TargetInstrInfo *TII, DebugLoc Dl, + MachineRegisterInfo &MRI, Register OpSrcReg, + Register OpDestReg, Register ElementCountReg, + Register TotalIterationsReg) { + + // First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop + // iteration counter, predication counter Current position in the src array + Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) + .addUse(OpSrcReg) + .addMBB(TpEntry) + .addUse(CurrSrcReg) + .addMBB(TpLoopBody); + + // Current position in the dest array + Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) + .addUse(OpDestReg) + .addMBB(TpEntry) + .addUse(CurrDestReg) + .addMBB(TpLoopBody); + + // Current loop counter + Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); + Register RemainingLoopIterationsReg = + MRI.createVirtualRegister(&ARM::GPRlrRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) + .addUse(TotalIterationsReg) + .addMBB(TpEntry) + .addUse(RemainingLoopIterationsReg) + .addMBB(TpLoopBody); + + // Predication counter + Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) + .addUse(ElementCountReg) + .addMBB(TpEntry) + .addUse(RemainingElementsReg) + .addMBB(TpLoopBody); + + // Pass predication counter to VCTP + Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) + .addUse(PredCounterPhiReg) + .addImm(ARMVCC::None) + .addReg(0); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) + .addUse(PredCounterPhiReg) + .addImm(16) + .add(predOps(ARMCC::AL)) + .addReg(0); + + // VLDRB and VSTRB instructions, predicated using VPR + Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); + BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) + .addDef(CurrSrcReg) + .addDef(LoadedValueReg) + .addReg(SrcPhiReg) + .addImm(16) + .addImm(ARMVCC::Then) + .addUse(VccrReg); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) + .addDef(CurrDestReg) + .addUse(LoadedValueReg, RegState::Kill) + .addReg(DestPhiReg) + .addImm(16) + .addImm(ARMVCC::Then) + .addUse(VccrReg); + + // Add the pseudoInstrs for decrementing the loop counter and marking the + // end:t2DoLoopDec and t2DoLoopEnd + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) + .addUse(LoopCounterPhiReg) + .addImm(1); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) + .addUse(RemainingLoopIterationsReg) + .addMBB(TpLoopBody); + + BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) + .addMBB(TpExit) + .add(predOps(ARMCC::AL)); +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -11097,6 +11234,82 @@ return BB; } + case ARM::MVE_MEMCPYLOOPINST: { + + // Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction + // into a Tail Predicated (TP) Loop. 
It adds the instructions to calculate + // the iteration count (= ceil(size_in_bytes/16)) in the TP entry block and + // adds the relevant instructions in the TP loop Body for generation of a + // WLSTP loop. + + // Below is the relevant portion of the CFG after the transformation. + // The Machine Basic Blocks are shown along with branch conditions (in + // brackets). Note that TP entry/exit MBBs depict the entry/exit of this + // portion of the CFG and may not necessarily be the entry/exit of the + // function. + + // (Relevant) CFG after transformation: + // TP entry MBB + // | + // |-----------------| + // (n <= 0) (n > 0) + // | | + // | TP loop Body MBB + // \ | + // \ / + // TP exit MBB + + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + Register OpDestReg = MI.getOperand(0).getReg(); + Register OpSrcReg = MI.getOperand(1).getReg(); + Register OpSizeReg = MI.getOperand(2).getReg(); + + // Allocate the required MBBs and add to parent function. + MachineBasicBlock *TpEntry = BB; + MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock(); + MachineBasicBlock *TpExit; + + MF->push_back(TpLoopBody); + + // If any instructions are present in the current block after + // MVE_MEMCPYLOOPINST, move them into the exit block. This is required since + // a terminator (t2WhileLoopStart) will be placed at that site. If no + // instructions are present after MVE_MEMCPYLOOPINST, then fallthrough is + // the exit. + TpExit = BB->splitAt(MI, false); + if (TpExit == BB) { + assert(BB->canFallThrough() && + "exit Block must be Fallthrough of the block containing memcpy"); + TpExit = BB->getFallThrough(); + } + + // Add logic for iteration count + Register TotalIterationsReg = + genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI); + + // Add the vectorized (and predicated) load/store instructions + genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg, + OpDestReg, OpSizeReg, TotalIterationsReg); + + // Connect the blocks + TpEntry->addSuccessor(TpLoopBody); + TpLoopBody->addSuccessor(TpLoopBody); + TpLoopBody->addSuccessor(TpExit); + + // Reorder for a more natural layout + TpLoopBody->moveAfter(TpEntry); + TpExit->moveAfter(TpLoopBody); + + // Finally, remove the memcpy Pseudo Instruction + MI.eraseFromParent(); + + // Return the exit block as it may contain other instructions requiring a + // custom inserter + return TpExit; + } + // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos.
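For reference, the net effect of the MVE_MEMCPYLOOPINST expansion above (genTPEntry computing ceil(n/16) iterations, genTPLoopBody emitting the VCTP8-predicated VLDRB/VSTRB pair) behaves like the following scalar model. This is only an illustrative sketch; the function and variable names here are made up and do not appear in the patch.

// Scalar model of the expanded tail-predicated memcpy (illustrative only).
#include <cstdint>

static void memcpyTPModel(uint8_t *Dst, const uint8_t *Src, uint32_t N) {
  uint32_t TotalIterations = (N + 15) / 16; // genTPEntry: t2ADDri 15, mask, t2LSRri 4
  uint32_t Remaining = N;                   // predication counter fed to MVE_VCTP8
  for (uint32_t I = 0; I < TotalIterations; ++I) {
    uint32_t Lanes = Remaining < 16 ? Remaining : 16; // lanes enabled by the VCTP8 mask
    for (uint32_t L = 0; L < Lanes; ++L)              // predicated VLDRBU8/VSTRBU8 pair
      Dst[I * 16 + L] = Src[I * 16 + L];
    Remaining -= 16; // t2SUBri by 16 feeding the loop-body PHI (unused after the last iteration)
  }
}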
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -6864,6 +6864,18 @@ let isTerminator = 1; } +def SDT_MVEMEMCPYLOOPNODE + : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; +def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def MVE_MEMCPYLOOPINST : PseudoInst<(outs), + (ins rGPR:$dst, rGPR:$src, rGPR:$sz), + NoItinerary, + [(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>; +} + def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>; def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>; def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>; diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -16,6 +16,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/Support/CommandLine.h" namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -17,6 +17,11 @@ #define DEBUG_TYPE "arm-selectiondag-info" +static cl::opt<cl::boolOrDefault> + EnableMemcpyTPLoop("arm-memcpy-tploop", cl::Hidden, + cl::desc("Enable/disable conversion of llvm.memcpy to " + "Tail predicated loops (WLSTP)")); + // Emit, if possible, a specialized version of the given Libcall. Typically this // means selecting the appropriately aligned version, but we also convert memset // of 0 into memclr. @@ -130,13 +135,31 @@ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { const ARMSubtarget &Subtarget = DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + + auto GenInlineTP = [=](const ARMSubtarget &Subtarget, + const SelectionDAG &DAG) { + return Subtarget.hasMVEIntegerOps() && + !DAG.getMachineFunction().getFunction().hasOptNone() && + ((!ConstantSize && (Alignment >= Align(4))) || + (ConstantSize && + ConstantSize->getZExtValue() > + Subtarget.getMaxInlineSizeThreshold() && + ConstantSize->getZExtValue() < + Subtarget.getMaxTPLoopInlineSizeThreshold())); + }; + + if ((EnableMemcpyTPLoop == cl::BOU_TRUE) || + (EnableMemcpyTPLoop == cl::BOU_UNSET && GenInlineTP(Subtarget, DAG))) + return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src, + Size); + // Do repeated 4-byte loads and stores. To be improved. // This requires 4-byte alignment. if (Alignment < Align(4)) return SDValue(); // This requires the copy size to be a constant, preferably // within a subtarget-specific limit.
- ConstantSDNode *ConstantSize = dyn_cast(Size); if (!ConstantSize) return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Alignment.value(), RTLIB::MEMCPY); diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -538,6 +538,11 @@ return 64; } + /// getMaxTPLoopSizeThreshold - Returns the maximum memcpy size + /// that still makes it profitable to inline the call as a Tail + /// Predicated loop + unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; } + /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll @@ -4,31 +4,36 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) { ; CHECK-LABEL: test_memcpy: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: blt .LBB0_3 +; CHECK-NEXT: blt .LBB0_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: lsls r4, r3, #2 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: lsl.w r12, r3, #2 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_2: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r0, r7, r6 -; CHECK-NEXT: add.w r1, r9, r6 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: bl __aeabi_memcpy4 -; CHECK-NEXT: add r6, r4 -; CHECK-NEXT: subs r5, #1 -; CHECK-NEXT: bne .LBB0_2 -; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB0_4 Depth 2 +; CHECK-NEXT: adds r4, r1, r7 +; CHECK-NEXT: adds r5, r0, r7 +; CHECK-NEXT: mov r6, r3 +; CHECK-NEXT: wlstp.8 lr, r6, .LBB0_3 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_3: @ %for.body +; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: add r7, r12 +; CHECK-NEXT: subs r2, #1 +; CHECK-NEXT: beq .LBB0_5 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_4: @ Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrb.u8 q0, [r4], #16 +; CHECK-NEXT: vstrb.8 q0, [r5], #16 +; CHECK-NEXT: letp lr, .LBB0_4 +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %for.body, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll b/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve_tp_loop.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O1 -mtriple=arm-arm-none-eabi -mcpu=cortex-m55 --verify-machineinstrs %s -o - | FileCheck %s + +; Check that WLSTP loop is not generated for alignment < 4 +; void test1(char* dest, char* src, int n){ +; 
memcpy(dest, src, n); +; } + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1 + +define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){ +; CHECK-LABEL: test1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl __aeabi_memcpy +; CHECK-NEXT: pop {r7, pc} +entry: + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %X, i8* align 1 %Y, i32 %n, i1 false) + ret void +} + + +; Check that WLSTP loop is generated for alignment >= 4 +; void test2(int* restrict X, int* restrict Y, int n){ +; memcpy(X, Y, n); +; } + + +define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n){ +; CHECK-LABEL: test2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: wlstp.8 lr, r2, .LBB1_2 +; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: .LBB1_2: @ %entry +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = bitcast i32* %X to i8* + %1 = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false) + ret void +} + + +; Checks that transform handles some arithmetic on the input arguments. +; void test3(int* restrict X, int* restrict Y, int n) +; { +; memcpy(X+2, Y+3, (n*2)+10); +; } + +define void @test3(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) { +; CHECK-LABEL: test3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #10 +; CHECK-NEXT: add.w r2, r3, r2, lsl #1 +; CHECK-NEXT: adds r0, #8 +; CHECK-NEXT: adds r1, #12 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB2_2 +; CHECK-NEXT: .LBB2_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: .LBB2_2: @ %entry +; CHECK-NEXT: pop {r7, pc} +entry: + %add.ptr = getelementptr inbounds i32, i32* %X, i32 2 + %0 = bitcast i32* %add.ptr to i8* + %add.ptr1 = getelementptr inbounds i32, i32* %Y, i32 3 + %1 = bitcast i32* %add.ptr1 to i8* + %mul = shl nsw i32 %n, 1 + %add = add nsw i32 %mul, 10 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 %0, i8* nonnull align 4 %1, i32 %add, i1 false) + ret void +} + + +; Checks that transform handles for loops that are implicitly converted to mempcy +; void test4(int* restrict X, int* restrict Y, int n){ +; for(int i = 0; i < n; ++i){ +; X[i] = Y[i]; +; } +; } + +define void @test4(i32* noalias %X, i32* noalias readonly %Y, i32 %n) { +; CHECK-LABEL: test4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB3_1: @ %for.body.preheader +; CHECK-NEXT: wlstp.8 lr, r2, .LBB3_3 +; CHECK-NEXT: .LBB3_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %X.bits = bitcast i32* %X to i8* + %Y.bits = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false) + br label 
%for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body.preheader, %entry + ret void +} + diff --git a/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir b/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve_tp_loop.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O1 -mtriple=arm-arm-none-eabi -mcpu=cortex-m55 -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s +--- | + ; ModuleID = 'llvm/test/CodeGen/Thumb2/mve_tp_loop.ll' + source_filename = "llvm/test/CodeGen/Thumb2/mve_tp_loop.ll" + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "arm-arm-none-eabi" + + ; Function Attrs: argmemonly nofree nosync nounwind willreturn + declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #0 + + define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) #1 { + entry: + %0 = bitcast i32* %X to i8* + %1 = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false) + ret void + } + + define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) #1 { + entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %X.bits = bitcast i32* %X to i8* + %Y.bits = bitcast i32* %Y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %for.body.preheader, %entry + ret void + } + + attributes #0 = { argmemonly nofree nosync nounwind willreturn "target-cpu"="cortex-m55" } + attributes #1 = { "target-cpu"="cortex-m55" } + +... 
+--- +name: test1 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r0, $r1, $r2 + + ; CHECK-LABEL: name: test1 + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr + ; CHECK: .1: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: .2.entry: + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + %2:rgpr = COPY $r2 + %1:rgpr = COPY $r1 + %0:rgpr = COPY $r0 + MVE_MEMCPYLOOPINST %0, %1, %2 + tBX_RET 14 /* CC::al */, $noreg + +... 
+--- +name: test2 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test2 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1 + ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg + ; CHECK: bb.1.for.body.preheader: + ; CHECK: successors: %bb.2(0x80000000), %bb.3(0x00000000) + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]] + ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr + ; CHECK: bb.3: + ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3 + ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3 + ; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3 + ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3 + ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg + ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]] + ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1 + ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $r0, $r1, $r2 + + %2:rgpr = COPY $r2 + %1:rgpr = COPY $r1 + %0:rgpr = COPY $r0 + t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 11 /* CC::lt */, $cpsr + t2B %bb.1, 14 /* CC::al */, $noreg + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + MVE_MEMCPYLOOPINST %0, %1, %2 + + bb.2.for.cond.cleanup: + tBX_RET 14 /* CC::al */, $noreg + +...
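A closing note on the SelectionDAG gating in ARMSelectionDAGInfo.cpp: the GenInlineTP lambda only fires when MVE integer ops are available, the function is not optnone, and either the size is a runtime value with at least 4-byte alignment, or the constant size falls between getMaxInlineSizeThreshold() and the new getMaxTPLoopInlineSizeThreshold() (64 and 128 bytes with the defaults in ARMSubtarget.h). Restated below as a free function for readability; this is a sketch with invented parameter names, not code from the patch.

// Sketch of the profitability check; not part of the patch.
#include <cstdint>

static bool wouldEmitMemcpyTPLoop(bool HasMVEIntegerOps, bool IsOptNone,
                                  uint64_t AlignInBytes, bool SizeIsConstant,
                                  uint64_t ConstSize, uint64_t MaxInlineSize,
                                  uint64_t MaxTPLoopInlineSize) {
  if (!HasMVEIntegerOps || IsOptNone)
    return false;
  if (!SizeIsConstant)
    return AlignInBytes >= 4; // runtime size: require word alignment
  // Constant sizes: only those too large for the plain inline expansion but
  // still below the TP-loop threshold.
  return ConstSize > MaxInlineSize && ConstSize < MaxTPLoopInlineSize;
}

The hidden arm-memcpy-tploop option layered on top of this check forces the TP loop when set to true, suppresses it when set to false, and defers to the heuristic when left unset.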