Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2193,239 +2193,271 @@ return true; } -// Change a chain of {load; op; store} of the same value into a simple op -// through memory of that value, if the uses of the modified value and its -// address are suitable. -// -// The tablegen pattern memory operand pattern is currently not able to match -// the case where the EFLAGS on the original operation are used. -// -// To move this to tablegen, we'll need to improve tablegen to allow flags to -// be transferred from a node in the pattern to the result node, probably with -// a new keyword. For example, we have this -// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", -// [(store (add (loadi64 addr:$dst), -1), addr:$dst), -// (implicit EFLAGS)]>; -// but maybe need something like this -// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", -// [(store (add (loadi64 addr:$dst), -1), addr:$dst), -// (transferrable EFLAGS)]>; -// -// Until then, we manually fold these and instruction select the operation -// here. -bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { - StoreSDNode *StoreNode = cast(Node); - SDValue StoredVal = StoreNode->getOperand(1); - unsigned Opc = StoredVal->getOpcode(); - - // Before we try to select anything, make sure this is memory operand size - // and opcode we can handle. Note that this must match the code below that - // actually lowers the opcodes. 
- EVT MemVT = StoreNode->getMemoryVT(); - if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && - MemVT != MVT::i8) - return false; - switch (Opc) { - default: - return false; - case X86ISD::INC: - case X86ISD::DEC: - case X86ISD::ADD: - case X86ISD::ADC: - case X86ISD::SUB: - case X86ISD::SBB: - case X86ISD::AND: - case X86ISD::OR: - case X86ISD::XOR: - break; + // Node fusions where some output edges come from a non-root node may result + // in child nodes violating the topological ordering of node ids. This helper + // invalidates the id (sets it to -1) of a node, and recursively of any of + // its users, if they have a node id less than Id. + static void InvalidateNodeIds(SDNode *Node, int Id) { + if (Id <= 0) + return; + SmallVector<SDNode *, 4> Nodes; + Nodes.push_back(Node); + while (!Nodes.empty()) { + SDNode *N = Nodes.pop_back_val(); + int NId = N->getNodeId(); + if (NId > 0 && NId < Id) { + N->setNodeId(-1); + for (auto *U : N->uses()) + Nodes.push_back(U); + } + } } - LoadSDNode *LoadNode = nullptr; - SDValue InputChain; - if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode, - InputChain)) - return false; - - SDValue Base, Scale, Index, Disp, Segment; - if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, - Segment)) - return false; - - auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, - unsigned Opc8) { - switch (MemVT.getSimpleVT().SimpleTy) { - case MVT::i64: - return Opc64; - case MVT::i32: - return Opc32; - case MVT::i16: - return Opc16; - case MVT::i8: - return Opc8; + // Change a chain of {load; op; store} of the same value into a simple op + // through memory of that value, if the uses of the modified value and its + // address are suitable. + // + // The tablegen pattern memory operand pattern is currently not able to match + // the case where the EFLAGS on the original operation are used. 
+ // + // To move this to tablegen, we'll need to improve tablegen to allow flags to + // be transferred from a node in the pattern to the result node, probably with + // a new keyword. For example, we have this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (implicit EFLAGS)]>; + // but maybe need something like this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (transferrable EFLAGS)]>; + // + // Until then, we manually fold these and instruction select the operation + // here. + bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { + StoreSDNode *StoreNode = cast(Node); + SDValue StoredVal = StoreNode->getOperand(1); + unsigned Opc = StoredVal->getOpcode(); + + // Before we try to select anything, make sure this is memory operand size + // and opcode we can handle. Note that this must match the code below that + // actually lowers the opcodes. + EVT MemVT = StoreNode->getMemoryVT(); + if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && + MemVT != MVT::i8) + return false; + switch (Opc) { default: - llvm_unreachable("Invalid size!"); + return false; + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::ADD: + case X86ISD::ADC: + case X86ISD::SUB: + case X86ISD::SBB: + case X86ISD::AND: + case X86ISD::OR: + case X86ISD::XOR: + break; } - }; - MachineSDNode *Result; - switch (Opc) { - case X86ISD::INC: - case X86ISD::DEC: { - unsigned NewOpc = - Opc == X86ISD::INC - ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) - : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); - const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; - Result = - CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); - break; - } - case X86ISD::ADD: - case X86ISD::ADC: - case X86ISD::SUB: - case X86ISD::SBB: - case X86ISD::AND: - case X86ISD::OR: - case X86ISD::XOR: { - auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { - switch (Opc) { - case X86ISD::ADD: - return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, - X86::ADD8mr); - case X86ISD::ADC: - return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, - X86::ADC8mr); - case X86ISD::SUB: - return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, - X86::SUB8mr); - case X86ISD::SBB: - return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, - X86::SBB8mr); - case X86ISD::AND: - return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, - X86::AND8mr); - case X86ISD::OR: - return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); - case X86ISD::XOR: - return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, - X86::XOR8mr); - default: - llvm_unreachable("Invalid opcode!"); - } - }; - auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { - switch (Opc) { - case X86ISD::ADD: - return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); - case X86ISD::ADC: - return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); - case X86ISD::SUB: - return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); - case X86ISD::SBB: - return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); - case X86ISD::AND: - return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); - case X86ISD::OR: - return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); - case X86ISD::XOR: - return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); - default: - 
llvm_unreachable("Invalid opcode!"); - } - }; - auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { - switch (Opc) { - case X86ISD::ADD: - return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, - X86::ADD8mi); - case X86ISD::ADC: - return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, - X86::ADC8mi); - case X86ISD::SUB: - return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, - X86::SUB8mi); - case X86ISD::SBB: - return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, - X86::SBB8mi); - case X86ISD::AND: - return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, - X86::AND8mi); - case X86ISD::OR: - return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, - X86::OR8mi); - case X86ISD::XOR: - return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, - X86::XOR8mi); + LoadSDNode *LoadNode = nullptr; + SDValue InputChain; + if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode, + InputChain)) + return false; + + SDValue Base, Scale, Index, Disp, Segment; + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, + Segment)) + return false; + + auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, + unsigned Opc8) { + switch (MemVT.getSimpleVT().SimpleTy) { + case MVT::i64: + return Opc64; + case MVT::i32: + return Opc32; + case MVT::i16: + return Opc16; + case MVT::i8: + return Opc8; default: - llvm_unreachable("Invalid opcode!"); + llvm_unreachable("Invalid size!"); } }; - unsigned NewOpc = SelectRegOpcode(Opc); - SDValue Operand = StoredVal->getOperand(1); - - // See if the operand is a constant that we can fold into an immediate - // operand. - if (auto *OperandC = dyn_cast(Operand)) { - auto OperandV = OperandC->getAPIntValue(); - - // Check if we can shrink the operand enough to fit in an immediate (or - // fit into a smaller immediate) by negating it and switching the - // operation. 
- if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && - ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 && - (-OperandV).getMinSignedBits() <= 8) || - (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 && - (-OperandV).getMinSignedBits() <= 32)) && - hasNoCarryFlagUses(StoredVal.getNode())) { - OperandV = -OperandV; - Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; - } + MachineSDNode *Result; + switch (Opc) { + case X86ISD::INC: + case X86ISD::DEC: { + unsigned NewOpc = + Opc == X86ISD::INC + ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) + : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); + const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; + Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, + Ops); + break; + } + case X86ISD::ADD: + case X86ISD::ADC: + case X86ISD::SUB: + case X86ISD::SBB: + case X86ISD::AND: + case X86ISD::OR: + case X86ISD::XOR: { + auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { + switch (Opc) { + case X86ISD::ADD: + return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, + X86::ADD8mr); + case X86ISD::ADC: + return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, + X86::ADC8mr); + case X86ISD::SUB: + return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, + X86::SUB8mr); + case X86ISD::SBB: + return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, + X86::SBB8mr); + case X86ISD::AND: + return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, + X86::AND8mr); + case X86ISD::OR: + return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, + X86::OR8mr); + case X86ISD::XOR: + return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, + X86::XOR8mr); + default: + llvm_unreachable("Invalid opcode!"); + } + }; + auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { + switch (Opc) { + case X86ISD::ADD: + return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); + case X86ISD::ADC: + return 
SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); + case X86ISD::SUB: + return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); + case X86ISD::SBB: + return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); + case X86ISD::AND: + return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); + case X86ISD::OR: + return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); + case X86ISD::XOR: + return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); + default: + llvm_unreachable("Invalid opcode!"); + } + }; + auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { + switch (Opc) { + case X86ISD::ADD: + return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, + X86::ADD8mi); + case X86ISD::ADC: + return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, + X86::ADC8mi); + case X86ISD::SUB: + return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, + X86::SUB8mi); + case X86ISD::SBB: + return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, + X86::SBB8mi); + case X86ISD::AND: + return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, + X86::AND8mi); + case X86ISD::OR: + return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, + X86::OR8mi); + case X86ISD::XOR: + return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, + X86::XOR8mi); + default: + llvm_unreachable("Invalid opcode!"); + } + }; + + unsigned NewOpc = SelectRegOpcode(Opc); + SDValue Operand = StoredVal->getOperand(1); + + // See if the operand is a constant that we can fold into an immediate + // operand. + if (auto *OperandC = dyn_cast(Operand)) { + auto OperandV = OperandC->getAPIntValue(); + + // Check if we can shrink the operand enough to fit in an immediate (or + // fit into a smaller immediate) by negating it and switching the + // operation. 
+ if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && + ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 && + (-OperandV).getMinSignedBits() <= 8) || + (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 && + (-OperandV).getMinSignedBits() <= 32)) && + hasNoCarryFlagUses(StoredVal.getNode())) { + OperandV = -OperandV; + Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; + } - // First try to fit this into an Imm8 operand. If it doesn't fit, then try - // the larger immediate operand. - if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) { - Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); - NewOpc = SelectImm8Opcode(Opc); - } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() && - (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) { - Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); - NewOpc = SelectImmOpcode(Opc); + // First try to fit this into an Imm8 operand. If it doesn't fit, then + // try the larger immediate operand. 
+ if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) { + Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); + NewOpc = SelectImm8Opcode(Opc); + } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() && + (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) { + Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); + NewOpc = SelectImmOpcode(Opc); + } } - } - if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { - SDValue CopyTo = - CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, - StoredVal.getOperand(2), SDValue()); + if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { + SDValue CopyTo = + CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, + StoredVal.getOperand(2), SDValue()); - const SDValue Ops[] = {Base, Scale, Index, Disp, - Segment, Operand, CopyTo, CopyTo.getValue(1)}; - Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, - Ops); - } else { - const SDValue Ops[] = {Base, Scale, Index, Disp, - Segment, Operand, InputChain}; - Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, - Ops); + const SDValue Ops[] = {Base, Scale, Index, Disp, + Segment, Operand, CopyTo, CopyTo.getValue(1)}; + Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, + MVT::Other, Ops); + } else { + const SDValue Ops[] = {Base, Scale, Index, Disp, + Segment, Operand, InputChain}; + Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, + MVT::Other, Ops); + } + break; + } + default: + llvm_unreachable("Invalid opcode!"); } - break; - } - default: - llvm_unreachable("Invalid opcode!"); - } - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); - MemOp[0] = StoreNode->getMemOperand(); - MemOp[1] = LoadNode->getMemOperand(); - Result->setMemRefs(MemOp, MemOp + 2); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); + MemOp[0] = StoreNode->getMemOperand(); + MemOp[1] = LoadNode->getMemOperand(); + Result->setMemRefs(MemOp, MemOp + 2); + + // Update 
Load Chain uses as well. + ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + + // If there are uses of StoredVal:1 we need to make sure the + // topological ordering of Ids has been maintained. Invalidate Ids if + // necessary. + int EffectiveId = std::max(StoreNode->getNodeId(), StoredVal->getNodeId()); + for (auto UI = StoredVal->use_begin(), E = StoredVal->use_end(); UI != E; + ++UI) { + if (UI.getUse().getResNo() == 1) + InvalidateNodeIds(*UI, EffectiveId); + } - // Update Load Chain uses as well. - ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); - ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); - ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); - CurDAG->RemoveDeadNode(Node); - return true; + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + CurDAG->RemoveDeadNode(Node); + return true; } // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. Index: llvm/test/CodeGen/X86/pr36212.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/pr36212.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +%struct.anon = type { i32, i32 } + +@c = common global %struct.anon zeroinitializer, align 4 +@d = local_unnamed_addr global %struct.anon* @c, align 8 +@a = common local_unnamed_addr global i32 0, align 4 +@b = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind uwtable +define void @g() local_unnamed_addr #0 { +; CHECK-LABEL: g: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: movl 4(%rax), %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: incl {{.*}}(%rip) +; CHECK-NEXT: setne %cl +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl %ecx, {{.*}}(%rip) +; CHECK-NEXT: retq +entry: + %0 = load %struct.anon*, 
%struct.anon** @d, align 8 + %y = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1 + %1 = load i32, i32* %y, align 4 + %2 = load i32, i32* @b, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* @b, align 4 + %tobool = icmp ne i32 %inc, 0 + %land.ext = zext i1 %tobool to i32 + %add = add nsw i32 %1, %land.ext + store i32 %add, i32* @a, align 4 + ret void +}