Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13067,22 +13067,6 @@ LoadSDNode *LD = cast(V->getOperand(0)); if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer. - // The store should be chained directly to the load or be an operand of a - // tokenfactor. - if (LD == Chain.getNode()) - ; // ok. - else if (Chain->getOpcode() != ISD::TokenFactor) - return Result; // Fail. - else { - bool isOk = false; - for (const SDValue &ChainOp : Chain->op_values()) - if (ChainOp.getNode() == LD) { - isOk = true; - break; - } - if (!isOk) return Result; - } - // This only handles simple types. if (V.getValueType() != MVT::i16 && V.getValueType() != MVT::i32 && @@ -13119,6 +13103,24 @@ // is aligned the same as the access width. if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result; + // For narrowing to be valid, it must be the case that the load the + // immediately preceeding memory operation before the store. + if (LD == Chain.getNode()) + ; // ok. + else if (Chain->getOpcode() == ISD::TokenFactor && + SDValue(LD, 1).hasOneUse()) { + // LD has only 1 chain use so they are no indirect dependencies. + bool isOk = false; + for (const SDValue &ChainOp : Chain->op_values()) + if (ChainOp.getNode() == LD) { + isOk = true; + break; + } + if (!isOk) + return Result; + } else + return Result; // Fail. + Result.first = MaskedBytes; Result.second = NotMaskTZ/8; return Result; Index: llvm/trunk/test/CodeGen/X86/pr37826.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/pr37826.ll +++ llvm/trunk/test/CodeGen/X86/pr37826.ll @@ -9,19 +9,12 @@ @e = common local_unnamed_addr global i32 0, align 4 @.str.1 = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 -; FIXME The generated code here is incorrect. We should only see a -; single store to f (a bytes store to f+3). +; We should only see a single store to f (a bytes store to f+3). define void @k(i32 %l) { ; CHECK-LABEL: k: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{.*}}(%rip), %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF -; CHECK-NEXT: xorl {{.*}}(%rip), %ecx -; CHECK-NEXT: xorl $2097151, %ecx # imm = 0x1FFFFF -; CHECK-NEXT: andl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: orl %ecx, %eax -; CHECK-NEXT: movl %eax, {{.*}}(%rip) +; CHECK-NEXT: orl {{.*}}(%rip), %eax ; CHECK-NEXT: shrl $24, %eax ; CHECK-NEXT: movb %al, f+{{.*}}(%rip) ; CHECK-NEXT: retq