Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14175,33 +14175,50 @@ continue; // Check for #1. + SmallPtrSet<MemSDNode *, 4> LaterMemOps; + SmallVector<const SDNode *, 16> Worklist; + SmallPtrSet<const SDNode *, 32> Visited; bool TryNext = false; for (SDNode *Use : BasePtr.getNode()->uses()) { if (Use == Ptr.getNode()) continue; + // Record other memory access users of this base, and record any + // which come after N. We can then try not to combine into post-inc when + // there are later users which could do it instead. + if (isa<MemSDNode>(Use)) { + Worklist.push_back(Use); + if (SDNode::hasPredecessorHelper(N, Visited, Worklist)) { + LaterMemOps.insert(cast<MemSDNode>(Use)); + break; + } + } + // If all the uses are load / store addresses, then don't do the // transformation. - if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { - bool RealUse = false; - for (SDNode *UseUse : Use->uses()) { - if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) - RealUse = true; - } + if (Use->getOpcode() != ISD::ADD && Use->getOpcode() != ISD::SUB) + continue; - if (!RealUse) { - TryNext = true; + bool RealUse = false; + for (SDNode *UseUse : Use->uses()) { + if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) { + RealUse = true; break; } } + + if (!RealUse) { + TryNext = true; + break; + } } - if (TryNext) + if (TryNext || !LaterMemOps.empty()) continue; // Check for #2. - SmallPtrSet<const SDNode *, 32> Visited; - SmallVector<const SDNode *, 16> Worklist; + Worklist.clear(); + Visited.clear(); // Ptr is predecessor to both N and Op. 
Visited.insert(Ptr.getNode()); Worklist.push_back(N); Index: llvm/test/CodeGen/Thumb/frame-access.ll =================================================================== --- llvm/test/CodeGen/Thumb/frame-access.ll +++ llvm/test/CodeGen/Thumb/frame-access.ll @@ -404,8 +404,8 @@ ; CHECK-NEXT: sub sp, #508 ; CHECK-NEXT: sub sp, #8 ; Argument addresses computed relative to BP -; CHECK: adds r0, r6, #7 -; CHECK-NEXT: adds r0, #13 +; CHECK: adds r4, r6, #7 +; CHECK-NEXT: adds r4, #13 ; CHECK: adds r1, r6, #7 ; CHECK-NEXT: adds r1, #9 ; CHECK: adds r5, r6, #7 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -400,18 +400,16 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 +; CHECK-NEXT: vldrwt.u32 q1, [r0] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %bb27 ; CHECK-NEXT: pop {r7, pc} @@ -464,13 +462,12 @@ ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r0] ; CHECK-NEXT: vpttt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 ; CHECK-NEXT: vctpt.32 r3 @@ -478,8 +475,7 
@@ ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %bb32 ; CHECK-NEXT: pop {r7, pc} Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -8,14 +8,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK: .LBB0_1: @ %vector.body -; CHECK: vldrb.s16 q0, [r1], #8 -; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -63,14 +62,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK: .LBB1_1: @ %vector.body -; CHECK: vldrb.u16 q0, [r1], #8 -; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -118,14 +116,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK: 
.LBB2_1: @ %vector.body -; CHECK: vldrh.s32 q0, [r1], #8 -; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.s32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -173,14 +170,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK: .LBB3_1: @ %vector.body -; CHECK: vldrh.u32 q0, [r1], #8 -; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -133,26 +133,24 @@ ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vdup.32 q1, r3 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: sub.w r12, r3, #1 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vdup.32 q2, r12 -; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vdup.32 q2, r3 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vorr 
q2, q2, q0 ; CHECK-NEXT: vpttt.u32 cs, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 +; CHECK-NEXT: vldrwt.u32 q4, [r2] ; CHECK-NEXT: vfma.f32 q4, q3, q2 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r2] -; CHECK-NEXT: mov r2, r3 +; CHECK-NEXT: vstrwt.32 q4, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9}