Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11200,10 +11200,19 @@
                                   FirstInChain->getAlignment());
 
   // Replace the last store with the new store
+  DEBUG(dbgs() << "NewStore: "; NewStore->dump(); dbgs() << "NewChain: ";
+        NewChain->dump());
+  unsigned MaxIROrder = 0;
+  for (unsigned i = 0; i < NumStores; ++i)
+    MaxIROrder = std::max(MaxIROrder, StoreNodes[i].MemNode->getIROrder());
+
   CombineTo(LatestOp, NewStore);
   // Erase all other stores.
   for (unsigned i = 0; i < NumStores; ++i) {
-    if (StoreNodes[i].MemNode == LatestOp)
+    if (StoreNodes[i].MemNode == LatestOp ||
+        StoreNodes[i].MemNode->getOpcode() == ISD::DELETED_NODE)
       continue;
+    DEBUG(dbgs() << "Merged Store [" << i << "]:";
+          StoreNodes[i].MemNode->dump());
     StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
     // ReplaceAllUsesWith will replace all uses that existed when it was
@@ -11216,11 +11225,43 @@
     // When we change it's chain to be St's chain they become identical,
     // get CSEed and the net result is that X is now a use of St.
     // Since we know that St is redundant, just iterate.
+    //
+    // Before that, handle cases like
+    //     TokF  / St1
+    //           \ St2 -> another st
+    // When St1 and St2 are combined into NewStore, we need to make sure that
+    // "another st" is chained to NewStore instead of TokF. There are also
+    // cases where one merged store is a use of another merged store, directly
+    // or indirectly; those uses must be replaced with St's chain to avoid
+    // cycles. So all uses of a merged store that are topologically strictly
+    // after all merged stores are re-chained to NewStore, and the remaining
+    // uses are replaced with St's chain by the loop below.
+    // FIXME: Since we don't have topological order here, we use IROrder.
+    //
+    // Re-chaining a user unlinks it from St's use list, so collect the users
+    // up front rather than editing the list while iterating over it.
+    std::vector<SDNode *> Users;
+    for (SDNode *U : St->uses())
+      Users.push_back(U);
+    for (SDNode *U : Users) {
+      DEBUG(dbgs() << "Checking User:"; U->dump(););
+      if (U->getIROrder() <= MaxIROrder || U->getOpcode() == ISD::TokenFactor)
+        continue;
+      // Point every operand that refers to St at NewStore instead.
+      std::vector<SDValue> Ops;
+      for (const SDUse &Op : U->ops())
+        Ops.push_back(Op.getNode() == St ? NewStore : Op.get());
+      DEBUG(dbgs() << "Morphing : "; U->dump(); dbgs() << " To: ";);
+      // UpdateNodeOperands keeps the CSE maps consistent, unlike MorphNodeTo.
+      // If an identical node already exists, U is left unchanged and is
+      // handled by the St-chain replacement below.
+      DAG.UpdateNodeOperands(U, Ops);
+      DEBUG(U->dump());
+    }
     while (!St->use_empty())
       DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
     deleteAndRecombine(St);
   }
-
   return true;
 }
 
Index: test/CodeGen/AArch64/merge-store.ll
===================================================================
--- test/CodeGen/AArch64/merge-store.ll
+++ test/CodeGen/AArch64/merge-store.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march aarch64 %s -o - | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone | FileCheck %s --check-prefix=CYCLONE
+; RUN: llc -mcpu cortex-a53 -march aarch64 %s -o - | FileCheck %s --check-prefix=A53
 
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
@@ -48,3 +49,66 @@
 ; CYCLONE-NEXT: str d1, [x0, #32]
 ; CYCLONE-NEXT: ret
 }
+
+
+; PR26827 - Merge stores causes wrong dependency.
+%struct1 = type { %struct1*, %struct1*, i32, i32, i16, i16, void (i32, i32, i8*)*, i8* }
+@gv0 = internal unnamed_addr global i32 0, align 4
+@gv1 = internal unnamed_addr global %struct1** null, align 8
+
+define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg) {
+;CHECK-LABEL: test
+entry:
+;A53: mov [[DATA:w[0-9]+]], w1
+;A53: str q{{[0-9]+}}, {{.*}}
+;A53: str q{{[0-9]+}}, {{.*}}
+;A53: str [[DATA]], {{.*}}
+
+  %0 = bitcast %struct1* %fde to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 40, i32 8, i1 false)
+  %state = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 4
+  store i16 256, i16* %state, align 8
+  %fd1 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 2
+  store i32 %fd, i32* %fd1, align 8
+  %force_eof = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 3
+  store i32 0, i32* %force_eof, align 4
+  %func2 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 6
+  store void (i32, i32, i8*)* %func, void (i32, i32, i8*)** %func2, align 8
+  %arg3 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 7
+  store i8* %arg, i8** %arg3, align 8
+  %call = tail call i32 (i32, i32, ...) @fcntl(i32 %fd, i32 4, i8* %0) #6
+  %1 = load i32, i32* %fd1, align 8
+  %cmp.i = icmp slt i32 %1, 0
+  br i1 %cmp.i, label %if.then.i, label %while.body.i.preheader
+if.then.i:
+  unreachable
+
+while.body.i.preheader:
+  %2 = load i32, i32* @gv0, align 4
+  %3 = icmp eq i32* %fd1, @gv0
+  br i1 %3, label %while.body.i.split, label %while.body.i.split.ver.us.preheader
+
+while.body.i.split.ver.us.preheader:
+  br label %while.body.i.split.ver.us
+
+while.body.i.split.ver.us:
+  %.reg2mem21.0 = phi i32 [ %mul.i.ver.us, %while.body.i.split.ver.us ], [ %2, %while.body.i.split.ver.us.preheader ]
+  %mul.i.ver.us = shl nsw i32 %.reg2mem21.0, 1
+  %4 = icmp sgt i32 %mul.i.ver.us, %1
+  br i1 %4, label %while.end.i, label %while.body.i.split.ver.us
+
+while.body.i.split:
+  br label %while.body.i.split
+
+while.end.i:
+  %call.i = tail call i8* @foo()
+  store i8* %call.i, i8** bitcast (%struct1*** @gv1 to i8**), align 8
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare i32 @fcntl(i32, i32, ...)
+declare noalias i8* @foo()