Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -183,6 +183,21 @@
   return isSmallTypeLdMerge(MI->getOpcode());
 }
 
+// Helper for findMatchingInsn. Returns true if MI may access memory (or is
+// a call or has unmodeled side effects) and the scan limit is exactly 1.
+static bool isAdjacentLdStMerge(MachineInstr *MI, unsigned Limit) {
+  if (!MI->mayStore() &&
+      !MI->mayLoad() &&
+      !MI->isCall() &&
+      !MI->hasUnmodeledSideEffects())
+    return false;
+
+  if (Limit == 1)
+    return true;
+
+  return false;
+}
+
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr *MI) {
   switch (MI->getOpcode()) {
@@ -820,7 +835,12 @@
     // If this is a volatile load/store that otherwise matched, stop looking
     // as something is going on that we don't have enough information to
     // safely transform. Similarly, stop if we see a hint to avoid pairs.
-    if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+    // However, if Limit is 1 we are only trying to merge the immediately
+    // adjacent load/store, so a volatile access may still be paired: the
+    // relative order of the two accesses is preserved. Note that the paired
+    // instruction carries no memory operands describing the volatile access.
+    if ((!isAdjacentLdStMerge(MI, Limit) && MI->hasOrderedMemoryRef()) ||
+        TII->isLdStPairSuppressed(MI))
       return E;
     // If the resultant immediate offset of merging these instructions
     // is out of range for a pairwise instruction, bail and keep looking.
@@ -1135,9 +1155,10 @@
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr *MI = MBBI;
   MachineBasicBlock::iterator E = MI->getParent()->end();
-  // If this is a volatile load/store, don't mess with it.
+
+  bool isVolatile = false;
   if (MI->hasOrderedMemoryRef())
-    return false;
+    isVolatile = true;
 
   // Make sure this is a reg+imm (as opposed to an address reloc).
   if (!getLdStOffsetOp(MI).isImm())
@@ -1150,7 +1171,16 @@
 
   // Look ahead up to ScanLimit instructions for a pairable instruction.
   LdStPairFlags Flags;
-  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+  MachineBasicBlock::iterator Paired;
+  if (isVolatile) {
+    // For a volatile load/store, only consider the immediately following
+    // instruction for merging: a ScanLimit of 1 means findMatchingInsn
+    // looks at most one instruction ahead.
+    Paired = findMatchingInsn(MBBI, Flags, 1);
+  } else {
+    Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+  }
+
   if (Paired != E) {
     if (isSmallTypeLdMerge(MI)) {
       ++NumSmallTypeMerged;
Index: test/CodeGen/AArch64/arm64-ldp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ldp.ll
+++ test/CodeGen/AArch64/arm64-ldp.ll
@@ -356,3 +356,22 @@
   ret i64 %add
 }
 
+; Test that the load/store optimizer combines adjacent volatile loads.
+; CHECK-LABEL: volatile_ldp_long
+; CHECK: ldr
+; CHECK: ldp
+; CHECK: ldr
+define i64 @volatile_ldp_long(i64* %p) nounwind {
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 0
+  %tmp = load volatile i64, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64, i64* %p, i64 2
+  %tmp1 = load volatile i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 3
+  %tmp2 = load volatile i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %p, i64 1
+  %tmp3 = load volatile i64, i64* %add.ptr3, align 8
+  %add = add nsw i64 %tmp1, %tmp
+  %add2 = add nsw i64 %tmp2, %add
+  %add3 = add nsw i64 %tmp3, %add2
+  ret i64 %add3
+}
Index: test/CodeGen/AArch64/arm64-platform-reg.ll
===================================================================
--- test/CodeGen/AArch64/arm64-platform-reg.ll
+++ test/CodeGen/AArch64/arm64-platform-reg.ll
@@ -14,14 +14,14 @@
   %val = load volatile [30 x i64], [30 x i64]* @var
   store volatile [30 x i64] %val, [30 x i64]* @var
 
-; CHECK: ldr x18
-; CHECK: str x18
+; CHECK: ldp x17, x18
+; CHECK: stp x17, x18
 
-; CHECK-RESERVE-X18-NOT: ldr fp
-; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18-NOT: ldp x17, fp
+; CHECK-RESERVE-X18-NOT: ldp x17, x18
 ; CHECK-RESERVE-X18: Spill
-; CHECK-RESERVE-X18-NOT: ldr fp
-; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18-NOT: ldp x17, fp
+; CHECK-RESERVE-X18-NOT: ldp x17, x18
 ; CHECK-RESERVE-X18: ret
   ret void
 }
Index: test/CodeGen/AArch64/arm64-stp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-stp.ll
+++ test/CodeGen/AArch64/arm64-stp.ll
@@ -129,3 +129,20 @@
   store i32 %b, i32* %add.ptr, align 4
   ret i32 %tmp2
 }
+
+; Test that the load/store optimizer combines adjacent volatile stores.
+; CHECK-LABEL: volatile_stp_long
+; CHECK: str x0, [x4]
+; CHECK: stp x1, x2, [x4, #16]
+; CHECK: str x3, [x4, #8]
+define void @volatile_stp_long(i64 %a, i64 %b, i64 %c, i64 %d, i64* nocapture %p) nounwind {
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 0
+  store volatile i64 %a, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64, i64* %p, i64 2
+  store volatile i64 %b, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 3
+  store volatile i64 %c, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %p, i64 1
+  store volatile i64 %d, i64* %add.ptr3, align 8
+  ret void
+}
Index: test/CodeGen/AArch64/arm64-volatile.ll
===================================================================
--- test/CodeGen/AArch64/arm64-volatile.ll
+++ test/CodeGen/AArch64/arm64-volatile.ll
@@ -2,26 +2,45 @@
 define i64 @normal_load(i64* nocapture %bar) nounwind readonly {
 ; CHECK: normal_load
 ; CHECK: ldp
+; CHECK: ldp
+; CHECK-NEXT: add
+; CHECK-NEXT: add
 ; CHECK-NEXT: add
 ; CHECK-NEXT: ret
   %add.ptr = getelementptr inbounds i64, i64* %bar, i64 1
   %tmp = load i64, i64* %add.ptr, align 8
-  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 2
+  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 5
   %tmp1 = load i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %bar, i64 2
+  %tmp2 = load i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %bar, i64 4
+  %tmp3 = load i64, i64* %add.ptr3, align 8
   %add = add nsw i64 %tmp1, %tmp
-  ret i64 %add
+  %add2 = add nsw i64 %tmp3, %tmp2
+  %add3 = add nsw i64 %add, %add2
+  ret i64 %add3
 }
 
 define i64 @volatile_load(i64* nocapture %bar) nounwind {
 ; CHECK: volatile_load
 ; CHECK: ldr
 ; CHECK-NEXT: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: add
+; CHECK-NEXT: add
 ; CHECK-NEXT: add
 ; CHECK-NEXT: ret
   %add.ptr = getelementptr inbounds i64, i64* %bar, i64 1
   %tmp = load volatile i64, i64* %add.ptr, align 8
-  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 2
+  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 5
   %tmp1 = load volatile i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %bar, i64 2
+  %tmp2 = load volatile i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %bar, i64 4
+  %tmp3 = load volatile i64, i64* %add.ptr3, align 8
   %add = add nsw i64 %tmp1, %tmp
-  ret i64 %add
+  %add2 = add nsw i64 %tmp3, %tmp2
+  %add3 = add nsw i64 %add, %add2
+  ret i64 %add3
 }