Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -183,6 +183,18 @@
   return isSmallTypeLdMerge(MI->getOpcode());
 }
 
+// Helper for findMatchingInsn: returns true if MI is an instruction that
+// touches memory (a load, store, call, or an instruction with unmodeled side
+// effects) and the scan limit is 1, i.e. MI is the instruction immediately
+// following the load/store we are trying to pair.
+static bool isRightNextLdStMerge(MachineInstr *MI, unsigned Limit) {
+  if (!MI->mayStore() && !MI->mayLoad() && !MI->isCall() &&
+      !MI->hasUnmodeledSideEffects())
+    return false;
+
+  return Limit == 1;
+}
+
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr *MI) {
   switch (MI->getOpcode()) {
@@ -756,6 +768,20 @@
   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
   int Offset = getLdStOffsetOp(FirstMI).getImm();
 
+  // If the first instruction is a volatile load/store, only consider the
+  // instruction immediately following it for merging. For this we simply
+  // set Limit to 1, so that only the very next instruction is examined.
+  // e.g.,
+  //   ldr x0, [x2]      ; volatile
+  //   ldr x1, [x2, #8]  ; the instruction right after the volatile load
+  //  becomes
+  //   ldp x0, x1, [x2]  ; the pair has no memory reference information and
+  //                     ; is therefore also treated as a volatile access
+  // The same holds when only the second instruction, or both of the
+  // ldr/str instructions, are volatile.
+  if (FirstMI->hasOrderedMemoryRef())
+    Limit = 1;
+
   // Early exit if the first instruction modifies the base register.
   // e.g., ldr x0, [x0]
   if (FirstMI->modifiesRegister(BaseReg, TRI))
@@ -820,7 +846,13 @@
     // If this is a volatile load/store that otherwise matched, stop looking
     // as something is going on that we don't have enough information to
     // safely transform. Similarly, stop if we see a hint to avoid pairs.
-    if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+    // However, if Limit is 1 we are only trying to merge the load/store that
+    // immediately follows, and swapping two adjacent accesses makes no
+    // semantic difference, so volatile loads/stores can still be merged.
+    // The resulting pair instruction has no memory reference information and
+    // is therefore itself treated as a volatile access.
+    if ((!isRightNextLdStMerge(MI, Limit) && MI->hasOrderedMemoryRef()) ||
+        TII->isLdStPairSuppressed(MI))
       return E;
     // If the resultant immediate offset of merging these instructions
     // is out of range for a pairwise instruction, bail and keep looking.
@@ -1135,9 +1167,6 @@
                                    MachineBasicBlock::iterator &MBBI) {
   MachineInstr *MI = MBBI;
   MachineBasicBlock::iterator E = MI->getParent()->end();
-  // If this is a volatile load/store, don't mess with it.
-  if (MI->hasOrderedMemoryRef())
-    return false;
 
   // Make sure this is a reg+imm (as opposed to an address reloc).
   if (!getLdStOffsetOp(MI).isImm())
Index: test/CodeGen/AArch64/arm64-ldp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ldp.ll
+++ test/CodeGen/AArch64/arm64-ldp.ll
@@ -356,3 +356,22 @@
   ret i64 %add
 }
 
+; Test the load/store optimizer---combine adjacent volatile ldr
+; CHECK-LABEL: volatile_ldp_long
+; CHECK: ldr
+; CHECK: ldp
+; CHECK: ldr
+define i64 @volatile_ldp_long(i64* %p) nounwind {
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 0
+  %tmp = load volatile i64, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64, i64* %p, i64 2
+  %tmp1 = load volatile i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 3
+  %tmp2 = load volatile i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %p, i64 1
+  %tmp3 = load volatile i64, i64* %add.ptr3, align 8
+  %add = add nsw i64 %tmp1, %tmp
+  %add2 = add nsw i64 %tmp2, %add
+  %add3 = add nsw i64 %tmp3, %add2
+  ret i64 %add3
+}
Index: test/CodeGen/AArch64/arm64-platform-reg.ll
===================================================================
--- test/CodeGen/AArch64/arm64-platform-reg.ll
+++ test/CodeGen/AArch64/arm64-platform-reg.ll
@@ -14,14 +14,14 @@
   %val = load volatile [30 x i64], [30 x i64]* @var
   store volatile [30 x i64] %val, [30 x i64]* @var
 
-; CHECK: ldr x18
-; CHECK: str x18
+; CHECK: ldp x17, x18
+; CHECK: stp x17, x18
 
-; CHECK-RESERVE-X18-NOT: ldr fp
-; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18-NOT: ldp x17, fp
+; CHECK-RESERVE-X18-NOT: ldp x17, x18
 ; CHECK-RESERVE-X18: Spill
-; CHECK-RESERVE-X18-NOT: ldr fp
-; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18-NOT: ldp x17, fp
+; CHECK-RESERVE-X18-NOT: ldp x17, x18
 ; CHECK-RESERVE-X18: ret
   ret void
 }
Index: test/CodeGen/AArch64/arm64-stp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-stp.ll
+++ test/CodeGen/AArch64/arm64-stp.ll
@@ -129,3 +129,20 @@
   store i32 %b, i32* %add.ptr, align 4
   ret i32 %tmp2
 }
+
+; Test the load/store optimizer---combine adjacent volatile str
+; CHECK-LABEL: volatile_stp_long
+; CHECK: str x0, [x4]
+; CHECK: stp x1, x2, [x4, #16]
+; CHECK: str x3, [x4, #8]
+define void @volatile_stp_long(i64 %a, i64 %b, i64 %c, i64 %d, i64* nocapture %p) nounwind {
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 0
+  store volatile i64 %a, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64, i64* %p, i64 2
+  store volatile i64 %b, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 3
+  store volatile i64 %c, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %p, i64 1
+  store volatile i64 %d, i64* %add.ptr3, align 8
+  ret void
+}
Index: test/CodeGen/AArch64/arm64-volatile.ll
===================================================================
--- test/CodeGen/AArch64/arm64-volatile.ll
+++ test/CodeGen/AArch64/arm64-volatile.ll
@@ -2,26 +2,45 @@
 define i64 @normal_load(i64* nocapture %bar) nounwind readonly {
 ; CHECK: normal_load
 ; CHECK: ldp
+; CHECK: ldp
+; CHECK-NEXT: add
+; CHECK-NEXT: add
 ; CHECK-NEXT: add
 ; CHECK-NEXT: ret
   %add.ptr = getelementptr inbounds i64, i64* %bar, i64 1
   %tmp = load i64, i64* %add.ptr, align 8
-  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 2
+  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 5
   %tmp1 = load i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %bar, i64 2
+  %tmp2 = load i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %bar, i64 4
+  %tmp3 = load i64, i64* %add.ptr3, align 8
   %add = add nsw i64 %tmp1, %tmp
-  ret i64 %add
+  %add2 = add nsw i64 %tmp3, %tmp2
+  %add3 = add nsw i64 %add, %add2
+  ret i64 %add3
 }
 
 define i64 @volatile_load(i64* nocapture %bar) nounwind {
 ; CHECK: volatile_load
 ; CHECK: ldr
 ; CHECK-NEXT: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: add
+; CHECK-NEXT: add
 ; CHECK-NEXT: add
 ; CHECK-NEXT: ret
   %add.ptr = getelementptr inbounds i64, i64* %bar, i64 1
   %tmp = load volatile i64, i64* %add.ptr, align 8
-  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 2
+  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 5
   %tmp1 = load volatile i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %bar, i64 2
+  %tmp2 = load volatile i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %bar, i64 4
+  %tmp3 = load volatile i64, i64* %add.ptr3, align 8
   %add = add nsw i64 %tmp1, %tmp
-  ret i64 %add
+  %add2 = add nsw i64 %tmp3, %tmp2
+  %add3 = add nsw i64 %add, %add2
+  ret i64 %add3
 }