Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13005,22 +13005,6 @@ LoadSDNode *LD = cast(V->getOperand(0)); if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer. - // The store should be chained directly to the load or be an operand of a - // tokenfactor. - if (LD == Chain.getNode()) - ; // ok. - else if (Chain->getOpcode() != ISD::TokenFactor) - return Result; // Fail. - else { - bool isOk = false; - for (const SDValue &ChainOp : Chain->op_values()) - if (ChainOp.getNode() == LD) { - isOk = true; - break; - } - if (!isOk) return Result; - } - // This only handles simple types. if (V.getValueType() != MVT::i16 && V.getValueType() != MVT::i32 && @@ -13057,6 +13041,36 @@ // is aligned the same as the access width. if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result; + // For narrowing to be valid, it must be the case that the load is the + // immediately preceding memory operation before the store. + if (LD == Chain.getNode()) + ; // ok. + else if (Chain->getOpcode() != ISD::TokenFactor) + return Result; // Fail. + else { + // If the chain is a token factor we can do so if LD is an operand _AND_ + // none of the other Token Factor operands are successors to LD. + SmallPtrSet Visited; + SmallVector Worklist; + + bool isOk = false; + for (const SDValue &ChainOp : Chain->op_values()) + if (ChainOp.getNode() == LD) + isOk = true; + else + Worklist.push_back(ChainOp.getNode()); + // Search non-LD operands' predecessors for LD. This is potentially + // expensive so we bound the search. To mitigate failures due to size of + // DAG, we also add LD's predecessors to the pruning list. 
+ + for (const SDValue &LdOp : LD->op_values()) + Visited.insert(LdOp.getNode()); + + const unsigned int Max = 1024; + if (!isOk || SDNode::hasPredecessorHelper(LD, Visited, Worklist, Max)) + return Result; + } + Result.first = MaskedBytes; Result.second = NotMaskTZ/8; return Result; Index: llvm/test/CodeGen/X86/pr37826.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/pr37826.ll @@ -0,0 +1,42 @@ +; RUN: llc -o - %s -mtriple=x86_64--unknown-linux-gnu | FileCheck %s + +; When compiled and run this should print zero. + +@c = common local_unnamed_addr global i32 0, align 4 +@f = common local_unnamed_addr global i32 0, align 4 +@e = common local_unnamed_addr global i32 0, align 4 +@.str.1 = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +; We should only see a single store to f (a byte store to f+3). +define void @k(i32 %l) { +; CHECK-LABEL: k: +; CHECK-NOT: {{.*}}, f({{[^,]*}} +; CHECK: movb {{.*}}, f+3(%rip) +; CHECK-NOT: {{.*}}, f({{[^,]*}} +; CHECK: ret + %load = load i32, i32* @c, align 4 + %load6 = load i32, i32* @f, align 4 + %clear7 = and i32 %load6, 16777215 + store i32 %clear7, i32* @c, align 4 + %neg = and i32 %load6, 2097151 + %value = xor i32 %neg, 2097151 + store i32 %load, i32* @c, align 4 + %t0 = load i32, i32* @e, align 4 + %value15 = xor i32 %t0, %value + %clear16 = and i32 %load6, -16777216 + %set17 = or i32 %value15, %clear16 + store i32 %set17, i32* @f, align 4 + %clear25 = and i32 %set17, -16777216 + %set26 = or i32 %clear25, %clear7 + store i32 %set26, i32* @f, align 4 + ret void +} + +declare i32 @printf(i8* nocapture readonly, ...) + +define i32 @main() { + tail call void @k(i32 1) + %load = load i32, i32* @f, align 4 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %load) + ret i32 0 +}