Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2093,11 +2093,13 @@
   case AArch64::STRSui:
   case AArch64::STRDui:
   case AArch64::STRQui:
+  case AArch64::STRQpre:
   case AArch64::STRXui:
   case AArch64::STRWui:
   case AArch64::LDRSui:
   case AArch64::LDRDui:
   case AArch64::LDRQui:
+  case AArch64::LDRQpre:
   case AArch64::LDRXui:
   case AArch64::LDRWui:
   case AArch64::LDRSWui:
@@ -2208,23 +2210,35 @@
 // Is this a candidate for ld/st merging or pairing? For example, we don't
 // touch volatiles or load/stores that have a hint to avoid pair formation.
 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
-  // If this is a volatile load/store, don't mess with it.
-  if (MI.hasOrderedMemoryRef())
+  // If this is a volatile load/store, don't mess with it unless the
+  // instruction is an LDRQpre.
+  if (MI.hasOrderedMemoryRef() && MI.getOpcode() != AArch64::LDRQpre)
     return false;

   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
   assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
          "Expected a reg or frame index operand.");
-  if (!MI.getOperand(2).isImm())
+
+  // For pre-indexed addressing quadword instructions, the immediate is
+  // at operand index 3 rather than 2.
+  bool IsPreLdSt = ((MI.getOpcode() == AArch64::STRQpre) ||
+                    (MI.getOpcode() == AArch64::LDRQpre)) &&
+                   MI.getOperand(3).isImm();
+
+  if (!MI.getOperand(2).isImm() && !IsPreLdSt)
     return false;

   // Can't merge/pair if the instruction modifies the base register.
   // e.g., ldr x0, [x0]
   // This case will never occur with an FI base.
+  // However, an LDRQpre can still be merged, e.g.,
+  //   ldr q0, [x11, #32]!
+  //   ldr q1, [x11, #16]
+  // to ldp q0, q1, [x11, #32]!
   if (MI.getOperand(1).isReg()) {
     Register BaseReg = MI.getOperand(1).getReg();
     const TargetRegisterInfo *TRI = &getRegisterInfo();
-    if (MI.modifiesRegister(BaseReg, TRI))
+    if (MI.modifiesRegister(BaseReg, TRI) && MI.getOpcode() != AArch64::LDRQpre)
       return false;
   }

@@ -2682,7 +2696,9 @@
   case AArch64::STRQui:
   case AArch64::STURQi:
   case AArch64::LDPQi:
+  case AArch64::LDRQpre:
   case AArch64::STPQi:
+  case AArch64::STRQpre:
   case AArch64::STGOffset:
   case AArch64::STZGOffset:
   case AArch64::ST2GOffset:
Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -249,6 +249,7 @@
   case AArch64::STRDui:
   case AArch64::STURDi:
   case AArch64::STRQui:
+  case AArch64::STRQpre:
   case AArch64::STURQi:
   case AArch64::STRBBui:
   case AArch64::STURBBi:
@@ -261,6 +262,7 @@
   case AArch64::LDRDui:
   case AArch64::LDURDi:
   case AArch64::LDRQui:
+  case AArch64::LDRQpre:
   case AArch64::LDURQi:
   case AArch64::LDRWui:
   case AArch64::LDURWi:
@@ -310,6 +312,8 @@
   case AArch64::STRQui:
   case AArch64::STURQi:
     return AArch64::STPQi;
+  case AArch64::STRQpre:
+    return AArch64::STPQpre;
   case AArch64::STRWui:
   case AArch64::STURWi:
     return AArch64::STPWi;
@@ -325,6 +329,8 @@
   case AArch64::LDRQui:
   case AArch64::LDURQi:
     return AArch64::LDPQi;
+  case AArch64::LDRQpre:
+    return AArch64::LDPQpre;
   case AArch64::LDRWui:
   case AArch64::LDURWi:
     return AArch64::LDPWi;
@@ -539,15 +545,30 @@
   }
 }

+// Returns whether the instruction is a pre-indexed load/store.
+static bool isPreLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRQpre:
+  case AArch64::LDRQpre:
+    return true;
+  }
+}
+
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
   bool IsPaired = isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
+  bool IsPreLdSt = isPreLdSt(MI);
+
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
   // All other pre/post indexed ldst instructions are unscaled.
-  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
+  Scale = (IsTagStore || IsPaired || IsPreLdSt)
+              ? AArch64InstrInfo::getMemScale(MI)
+              : 1;

   if (IsPaired) {
     MinOffset = -64;
@@ -561,17 +582,20 @@
 static MachineOperand &getLdStRegOp(MachineInstr &MI,
                                     unsigned PairedRegOp = 0) {
   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
-  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
+  bool IsPreLdSt = isPreLdSt(MI);
+  if (IsPreLdSt)
+    PairedRegOp = 1;
+  unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
   return MI.getOperand(Idx);
 }

 static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
+  unsigned Idx = isPairedLdSt(MI) || isPreLdSt(MI) ? 2 : 1;
   return MI.getOperand(Idx);
 }

 static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
+  unsigned Idx = isPairedLdSt(MI) || isPreLdSt(MI) ? 3 : 2;
   return MI.getOperand(Idx);
 }

@@ -940,13 +964,25 @@
       MI.clearRegisterKills(Reg, TRI);
     }
   }
-  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
-            .add(RegOp0)
-            .add(RegOp1)
-            .add(BaseRegOp)
-            .addImm(OffsetImm)
-            .cloneMergedMemRefs({&*I, &*Paired})
-            .setMIFlags(I->mergeFlagsWith(*Paired));
+
+  int Scale, MinOffset, MaxOffset;
+  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
+
+  unsigned MatchPairOpcode = getMatchingPairOpcode(Opc);
+  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
+
+  if (MatchPairOpcode == AArch64::STPQpre ||
+      MatchPairOpcode == AArch64::LDPQpre) {
+    MIB.addReg(BaseRegOp.getReg(), RegState::Define);
+  } else
+    assert(Scale == 1 && "Scale for non-pre/post-indexed variants must be 1.");
+
+  MIB.add(RegOp0)
+      .add(RegOp1)
+      .add(BaseRegOp)
+      .addImm(OffsetImm / Scale)
+      .cloneMergedMemRefs({&*I, &*Paired})
+      .setMIFlags(I->mergeFlagsWith(*Paired));

   (void)MIB;

@@ -1225,8 +1261,10 @@
     return false;

   // We should have already checked FirstMI for pair suppression and volatility.
-  assert(!FirstMI.hasOrderedMemoryRef() &&
-         !TII->isLdStPairSuppressed(FirstMI) &&
+  // An LDRQpre is exempt from these checks.
+  assert(((!FirstMI.hasOrderedMemoryRef() &&
+           !TII->isLdStPairSuppressed(FirstMI)) ||
+          FirstMI.getOpcode() == AArch64::LDRQpre) &&
          "FirstMI shouldn't get here if either of these checks are true.");

   unsigned OpcA = FirstMI.getOpcode();
@@ -1257,6 +1295,11 @@
   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
     return false;

+  // STRQpre-STRQui and LDRQpre-LDRQui are candidate pairs that can be merged.
+  if ((OpcA == AArch64::STRQpre && OpcB == AArch64::STRQui) ||
+      (OpcA == AArch64::LDRQpre && OpcB == AArch64::LDRQui))
+    return true;
+
   // Try to match an unscaled load/store with a scaled load/store.
return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); @@ -1643,7 +1686,17 @@ // Update list of instructions that read/write memory. if (MI.mayLoadOrStore()) MemInsns.push_back(&MI); + + // The STRQpre and LDRQpre instructions can be merged with STRQui and LDRQui, respectively. + if (((FirstMI.getOpcode() == AArch64::STRQpre) && + (MI.getOpcode() == AArch64::STRQui)) || + ((FirstMI.getOpcode() == AArch64::LDRQpre) && + (MI.getOpcode() == AArch64::LDRQui))) { + Flags.setMergeForward(true); + return MBBI; + } } + return E; } Index: llvm/test/CodeGen/AArch64/ldrqpre-ldrqui-merge.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/ldrqpre-ldrqui-merge.mir @@ -0,0 +1,250 @@ +# RUN: llc -mtriple=aarch64-none-eabi -mcpu=cortex-a55 -lsr-preferred-addressing-mode=preindexed -run-pass=aarch64-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s +# CHECK: early-clobber $x11, renamable $q0, renamable $q1 = LDPQpre renamable $x11, 2 +# CHECK-NEXT: early-clobber $x10, renamable $q2, renamable $q3 = LDPQpre renamable $x10, 2 + +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-none-unknown-eabi" + + ; Function Attrs: nofree norecurse nounwind uwtable mustprogress + define dso_local void @ldrqpre-ldrqui-merge(float* nocapture readonly %S, float* nocapture %D, i32 %N) local_unnamed_addr #0 { + entry: + %cmp11.not = icmp eq i32 %N, 0 + br i1 %cmp11.not, label %for.cond.cleanup, label %for.body.preheader + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader2, label %vector.memcheck + + for.body.preheader2: ; preds = %middle.block, %vector.memcheck, %for.body.preheader + %indvars.iv.ph = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ], [ 0, %vector.memcheck ] + %0 = sub i64 %wide.trip.count, %indvars.iv.ph + %1 = add nsw i64 %indvars.iv.ph, -1 + %scevgep3 = getelementptr float, float* %S, i64 %1 + %scevgep7 = getelementptr float, float* %D, i64 %1 + br label %for.body + + vector.memcheck: ; preds = %for.body.preheader + %scevgep = getelementptr float, float* %D, i64 %wide.trip.count + %scevgep16 = getelementptr float, float* %S, i64 %wide.trip.count + %bound0 = icmp ugt float* %scevgep16, %D + %bound1 = icmp ugt float* %scevgep, %S + %found.conflict = and i1 %bound0, %bound1 + br i1 %found.conflict, label %for.body.preheader2, label %vector.ph + + vector.ph: ; preds = %vector.memcheck + %n.vec = and i64 %wide.trip.count, 4294967288 + %scevgep13 = getelementptr float, float* %S, i64 -8 + %scevgep19 = getelementptr float, float* %D, i64 -8 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv20 = phi float* [ %scevgep21, %vector.body ], [ %scevgep19, %vector.ph ] + %lsr.iv14 = phi float* [ %scevgep15, %vector.body ], [ %scevgep13, %vector.ph ] + %lsr.iv11 = phi i64 [ %lsr.iv.next12, %vector.body ], [ %n.vec, %vector.ph ] + %lsr.iv1416 = bitcast float* %lsr.iv14 to <4 x float>* + %lsr.iv2022 = bitcast float* %lsr.iv20 to <4 x float>* + %scevgep23 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 2 + %wide.load = load <4 x float>, <4 x float>* %scevgep23, align 4, !tbaa !6, !alias.scope !10, !noalias !13 + %scevgep26 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 3 + %wide.load18 = load <4 x float>, <4 x float>* %scevgep26, align 4, !tbaa 
!6, !alias.scope !10, !noalias !13 + %scevgep17 = getelementptr <4 x float>, <4 x float>* %lsr.iv1416, i64 2 + %wide.load19 = load <4 x float>, <4 x float>* %scevgep17, align 4, !tbaa !6, !alias.scope !13 + %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1416, i64 3 + %wide.load20 = load <4 x float>, <4 x float>* %scevgep18, align 4, !tbaa !6, !alias.scope !13 + %2 = fadd fast <4 x float> %wide.load19, %wide.load + %3 = fadd fast <4 x float> %wide.load20, %wide.load18 + %scevgep24 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 2 + store <4 x float> %2, <4 x float>* %scevgep24, align 4, !tbaa !6, !alias.scope !10, !noalias !13 + %scevgep25 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 3 + store <4 x float> %3, <4 x float>* %scevgep25, align 4, !tbaa !6, !alias.scope !10, !noalias !13 + %lsr.iv.next12 = add i64 %lsr.iv11, -8 + %scevgep15 = getelementptr float, float* %lsr.iv14, i64 8 + %scevgep21 = getelementptr float, float* %lsr.iv20, i64 8 + %4 = icmp eq i64 %lsr.iv.next12, 0 + br i1 %4, label %middle.block, label %vector.body, !llvm.loop !15 + + middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader2 + + for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void + + for.body: ; preds = %for.body.preheader2, %for.body + %lsr.iv8 = phi float* [ %scevgep7, %for.body.preheader2 ], [ %scevgep9, %for.body ] + %lsr.iv4 = phi float* [ %scevgep3, %for.body.preheader2 ], [ %scevgep5, %for.body ] + %lsr.iv = phi i64 [ %0, %for.body.preheader2 ], [ %lsr.iv.next, %for.body ] + %scevgep10 = getelementptr float, float* %lsr.iv8, i64 1 + %5 = load float, float* %scevgep10, align 4, !tbaa !6 + %scevgep6 = getelementptr float, float* %lsr.iv4, i64 1 + %6 = load float, float* %scevgep6, align 4, !tbaa !6 + %add = fadd fast float %6, %5 + store float %add, float* %scevgep10, align 4, !tbaa !6 + %lsr.iv.next = add i64 %lsr.iv, -1 + %scevgep5 = getelementptr float, float* %lsr.iv4, i64 1 + %scevgep9 = getelementptr float, float* %lsr.iv8, i64 1 + %exitcond.not = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !18 + } + + attributes #0 = { nofree norecurse nounwind uwtable mustprogress "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a55" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+v8.2a" "unsafe-fp-math"="true" } + + !llvm.module.flags = !{!0, !1, !2, !3, !4} + !llvm.ident = !{!5} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"branch-target-enforcement", i32 0} + !2 = !{i32 1, !"sign-return-address", i32 0} + !3 = !{i32 1, !"sign-return-address-all", i32 0} + !4 = !{i32 1, !"sign-return-address-with-bkey", i32 0} + !5 = !{!"clang version 13.0.0"} + !6 = !{!7, !7, i64 0} + !7 = !{!"float", !8, i64 0} + !8 = !{!"omnipotent char", !9, i64 0} + !9 = !{!"Simple C++ TBAA"} + !10 = !{!11} + !11 = distinct !{!11, !12} + !12 = distinct !{!12, !"LVerDomain"} + !13 = !{!14} + !14 = distinct !{!14, !12} + !15 = distinct !{!15, !16, !17} + !16 = !{!"llvm.loop.mustprogress"} + !17 = !{!"llvm.loop.isvectorized", i32 1} + !18 = distinct !{!18, !16, !17} + +... 
+--- +name: ldrqpre-ldrqui-merge +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$x0', virtual-reg: '' } + - { reg: '$x1', virtual-reg: '' } + - { reg: '$w2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + hasRedZone: false +body: | + bb.0.entry: + successors: %bb.10(0x30000000), %bb.1(0x50000000) + liveins: $w2, $x0, $x1 + + CBZW renamable $w2, %bb.10 + + bb.1.for.body.preheader: + successors: %bb.2(0x40000000), %bb.5(0x40000000) + liveins: $w2, $x0, $x1 + + renamable $w8 = ORRWrs $wzr, renamable $w2, 0, implicit-def $x8 + dead $wzr = SUBSWri killed renamable $w2, 7, 0, implicit-def $nzcv + Bcc 8, %bb.5, implicit killed $nzcv + + bb.2: + successors: %bb.3(0x80000000) + liveins: $x0, $x1, $x8 + + $x9 = ORRXrs $xzr, $xzr, 0 + + bb.3.for.body.preheader2: + successors: %bb.4(0x80000000) + liveins: $x0, $x1, $x8, $x9 + + renamable $x10 = UBFMXri renamable $x9, 62, 61 + $x8 = SUBXrs killed renamable $x8, killed renamable $x9, 0 + renamable $x10 = SUBXri killed renamable $x10, 4, 0 + $x9 = ADDXrs killed renamable $x0, renamable $x10, 0 + $x10 = ADDXrs killed renamable $x1, killed renamable $x10, 0 + + bb.4.for.body: + successors: %bb.10(0x04000000), %bb.4(0x7c000000) + liveins: $x8, $x9, $x10 + + early-clobber renamable $x10, renamable $s0 = LDRSpre killed renamable $x10, 4 + early-clobber renamable $x9, renamable $s1 = LDRSpre killed renamable $x9, 4 + renamable $s0 = nnan ninf nsz arcp contract afn reassoc FADDSrr killed renamable $s1, killed renamable $s0 + renamable $x8 = SUBSXri killed renamable $x8, 1, 0, implicit-def $nzcv + STRSui killed renamable $s0, renamable $x10, 0 :: (store 4 into %ir.scevgep10, !tbaa !6) + Bcc 1, %bb.4, implicit $nzcv + B %bb.10 + + bb.5.vector.memcheck: + successors: %bb.6(0x60000000), %bb.7(0x20000000) + liveins: $x0, $x1, $x8 + + renamable $x9 = UBFMXri renamable $x8, 62, 61 + $x10 = ADDXrs renamable $x0, renamable $x9, 0 + $xzr = SUBSXrs killed renamable $x10, renamable $x1, 0, implicit-def $nzcv, implicit-def $nzcv + Bcc 9, %bb.7, implicit $nzcv + + bb.6.vector.memcheck: + successors: %bb.2(0x55555555), %bb.7(0x2aaaaaab) + liveins: $x0, $x1, $x8, $x9 + + $x9 = ADDXrs renamable $x1, killed renamable $x9, 0 + $xzr = SUBSXrs killed renamable $x9, renamable $x0, 0, implicit-def $nzcv, implicit-def $nzcv + Bcc 8, %bb.2, implicit $nzcv + + bb.7.vector.ph: + successors: %bb.8(0x80000000) + liveins: $x0, $x1, $x8 + + renamable $x9 = ANDXri renamable $x8, 8028 + renamable $x10 = SUBXri renamable $x0, 32, 0 + renamable $x11 = SUBXri renamable $x1, 32, 0 + $x12 = ORRXrs $xzr, $x9, 0 + + bb.8.vector.body: + successors: %bb.9(0x04000000), %bb.8(0x7c000000) + liveins: $x0, $x1, $x8, $x9, $x10, $x11, $x12 + + early-clobber $x11, renamable $q0, renamable $q1 = LDPQpre renamable $x11, 2 + early-clobber $x10, renamable $q2, renamable $q3 = LDPQpre renamable $x10, 2 + renamable $q0 = nnan ninf nsz arcp contract afn reassoc 
FADDv4f32 killed renamable $q2, killed renamable $q0
+    renamable $q1 = nnan ninf nsz arcp contract afn reassoc FADDv4f32 killed renamable $q3, killed renamable $q1
+    renamable $x12 = SUBSXri killed renamable $x12, 8, 0, implicit-def $nzcv
+    STPQi renamable $q0, renamable $q1, renamable $x11, 0 :: (store 16 into %ir.scevgep24, align 4, !tbaa !6, !alias.scope !10, !noalias !13), (store 16 into %ir.scevgep25, align 4, !tbaa !6, !alias.scope !10, !noalias !13)
+    Bcc 1, %bb.8, implicit killed $nzcv
+
+  bb.9.middle.block:
+    successors: %bb.10(0x40000000), %bb.3(0x40000000)
+    liveins: $x0, $x1, $x8, $x9
+
+    $xzr = SUBSXrs renamable $x9, renamable $x8, 0, implicit-def $nzcv, implicit-def $nzcv
+    Bcc 1, %bb.3, implicit $nzcv
+
+  bb.10.for.cond.cleanup:
+    RET undef $lr
+
+...
Index: llvm/test/CodeGen/AArch64/strqpre-strqui-merge.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/strqpre-strqui-merge.mir
@@ -0,0 +1,68 @@
+# RUN: llc -mtriple=aarch64-none-eabi -mcpu=cortex-a55 -lsr-preferred-addressing-mode=preindexed -run-pass=aarch64-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s
+# CHECK: early-clobber $x0 = STPQpre killed renamable $q0, killed renamable $q1, renamable $x0, 2 :: (store 16 into %ir.p0), (store 16 into %ir.p1)
+
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-none-unknown-eabi"
+
+  define <4 x i32>* @strqpre-strqui-merge(<4 x i32>* %p, <4 x i32> %a, <4 x i32> %b) #0 {
+  entry:
+    %p0 = getelementptr <4 x i32>, <4 x i32>* %p, i32 2
+    store <4 x i32> %a, <4 x i32>* %p0, align 16
+    %p1 = getelementptr <4 x i32>, <4 x i32>* %p, i32 3
+    store <4 x i32> %b, <4 x i32>* %p1, align 16
+    ret <4 x i32>* %p0
+  }
+
+  attributes #0 = { "target-cpu"="cortex-a55" }
+
+...
+---
+name: strqpre-strqui-merge
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+  - { reg: '$q0', virtual-reg: '' }
+  - { reg: '$q1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 1
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+  hasRedZone: false
+body: |
+  bb.0.entry:
+    liveins: $q0, $q1, $x0
+    early-clobber renamable $x0 = STRQpre killed renamable $q0, killed renamable $x0, 32 :: (store 16 into %ir.p0)
+    STRQui killed renamable $q1, renamable $x0, 1 :: (store 16 into %ir.p1)
+    RET undef $lr, implicit $x0
+
+...