Index: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -194,7 +194,6 @@
 private:
   SDNode *Select(SDNode *N) override;
   SDNode *selectGather(SDNode *N, unsigned Opc);
-  SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT);

   bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
   bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -1713,295 +1712,6 @@
   return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
 }

-/// Atomic opcode table
-///
-enum AtomicOpc {
-  ADD,
-  SUB,
-  INC,
-  DEC,
-  OR,
-  AND,
-  XOR,
-  AtomicOpcEnd
-};
-
-enum AtomicSz {
-  ConstantI8,
-  I8,
-  SextConstantI16,
-  ConstantI16,
-  I16,
-  SextConstantI32,
-  ConstantI32,
-  I32,
-  SextConstantI64,
-  ConstantI64,
-  I64,
-  AtomicSzEnd
-};
-
-static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
-  {
-    X86::LOCK_ADD8mi,
-    X86::LOCK_ADD8mr,
-    X86::LOCK_ADD16mi8,
-    X86::LOCK_ADD16mi,
-    X86::LOCK_ADD16mr,
-    X86::LOCK_ADD32mi8,
-    X86::LOCK_ADD32mi,
-    X86::LOCK_ADD32mr,
-    X86::LOCK_ADD64mi8,
-    X86::LOCK_ADD64mi32,
-    X86::LOCK_ADD64mr,
-  },
-  {
-    X86::LOCK_SUB8mi,
-    X86::LOCK_SUB8mr,
-    X86::LOCK_SUB16mi8,
-    X86::LOCK_SUB16mi,
-    X86::LOCK_SUB16mr,
-    X86::LOCK_SUB32mi8,
-    X86::LOCK_SUB32mi,
-    X86::LOCK_SUB32mr,
-    X86::LOCK_SUB64mi8,
-    X86::LOCK_SUB64mi32,
-    X86::LOCK_SUB64mr,
-  },
-  {
-    0,
-    X86::LOCK_INC8m,
-    0,
-    0,
-    X86::LOCK_INC16m,
-    0,
-    0,
-    X86::LOCK_INC32m,
-    0,
-    0,
-    X86::LOCK_INC64m,
-  },
-  {
-    0,
-    X86::LOCK_DEC8m,
-    0,
-    0,
-    X86::LOCK_DEC16m,
-    0,
-    0,
-    X86::LOCK_DEC32m,
-    0,
-    0,
-    X86::LOCK_DEC64m,
-  },
-  {
-    X86::LOCK_OR8mi,
-    X86::LOCK_OR8mr,
-    X86::LOCK_OR16mi8,
-    X86::LOCK_OR16mi,
-    X86::LOCK_OR16mr,
-    X86::LOCK_OR32mi8,
-    X86::LOCK_OR32mi,
-    X86::LOCK_OR32mr,
-    X86::LOCK_OR64mi8,
-    X86::LOCK_OR64mi32,
-    X86::LOCK_OR64mr,
-  },
-  {
-    X86::LOCK_AND8mi,
-    X86::LOCK_AND8mr,
-    X86::LOCK_AND16mi8,
-    X86::LOCK_AND16mi,
-    X86::LOCK_AND16mr,
-    X86::LOCK_AND32mi8,
-    X86::LOCK_AND32mi,
-    X86::LOCK_AND32mr,
-    X86::LOCK_AND64mi8,
-    X86::LOCK_AND64mi32,
-    X86::LOCK_AND64mr,
-  },
-  {
-    X86::LOCK_XOR8mi,
-    X86::LOCK_XOR8mr,
-    X86::LOCK_XOR16mi8,
-    X86::LOCK_XOR16mi,
-    X86::LOCK_XOR16mr,
-    X86::LOCK_XOR32mi8,
-    X86::LOCK_XOR32mi,
-    X86::LOCK_XOR32mr,
-    X86::LOCK_XOR64mi8,
-    X86::LOCK_XOR64mi32,
-    X86::LOCK_XOR64mr,
-  }
-};
-
-// Return the target constant operand for atomic-load-op and do simple
-// translations, such as from atomic-load-add to lock-sub. The return value is
-// one of the following 3 cases:
-// + target-constant, the operand could be supported as a target constant.
-// + empty, the operand is not needed any more with the new op selected.
-// + non-empty, otherwise.
-static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
-                                                SDLoc dl,
-                                                enum AtomicOpc &Op, MVT NVT,
-                                                SDValue Val,
-                                                const X86Subtarget *Subtarget) {
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
-    int64_t CNVal = CN->getSExtValue();
-    // Quit if not 32-bit imm.
-    if ((int32_t)CNVal != CNVal)
-      return Val;
-    // Quit if INT32_MIN: it would be negated as it is negative and overflow,
-    // producing an immediate that does not fit in the 32 bits available for
-    // an immediate operand to sub. However, it still fits in 32 bits for the
-    // add (since it is not negated) so we can return target-constant.
-    if (CNVal == INT32_MIN)
-      return CurDAG->getTargetConstant(CNVal, dl, NVT);
-    // For atomic-load-add, we could do some optimizations.
-    if (Op == ADD) {
-      // Translate to INC/DEC if ADD by 1 or -1.
-      if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) {
-        Op = (CNVal == 1) ? INC : DEC;
-        // No more constant operand after being translated into INC/DEC.
-        return SDValue();
-      }
-      // Translate to SUB if ADD by negative value.
-      if (CNVal < 0) {
-        Op = SUB;
-        CNVal = -CNVal;
-      }
-    }
-    return CurDAG->getTargetConstant(CNVal, dl, NVT);
-  }
-
-  // If the value operand is single-used, try to optimize it.
-  if (Op == ADD && Val.hasOneUse()) {
-    // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
-    if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
-      Op = SUB;
-      return Val.getOperand(1);
-    }
-    // A special case for i16, which needs truncating as, in most cases, it's
-    // promoted to i32. We will translate
-    // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
-    if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
-        Val.getOperand(0).getOpcode() == ISD::SUB &&
-        X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
-      Op = SUB;
-      Val = Val.getOperand(0);
-      return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
-                                            Val.getOperand(1));
-    }
-  }
-
-  return Val;
-}
-
-SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) {
-  if (Node->hasAnyUseOfValue(0))
-    return nullptr;
-
-  SDLoc dl(Node);
-
-  // Optimize common patterns for __sync_or_and_fetch and similar arith
-  // operations where the result is not used. This allows us to use the "lock"
-  // version of the arithmetic instruction.
-  SDValue Chain = Node->getOperand(0);
-  SDValue Ptr = Node->getOperand(1);
-  SDValue Val = Node->getOperand(2);
-  SDValue Base, Scale, Index, Disp, Segment;
-  if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
-    return nullptr;
-
-  // Which index into the table.
-  enum AtomicOpc Op;
-  switch (Node->getOpcode()) {
-    default:
-      return nullptr;
-    case ISD::ATOMIC_LOAD_OR:
-      Op = OR;
-      break;
-    case ISD::ATOMIC_LOAD_AND:
-      Op = AND;
-      break;
-    case ISD::ATOMIC_LOAD_XOR:
-      Op = XOR;
-      break;
-    case ISD::ATOMIC_LOAD_ADD:
-      Op = ADD;
-      break;
-  }
-
-  Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget);
-  bool isUnOp = !Val.getNode();
-  bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
-
-  unsigned Opc = 0;
-  switch (NVT.SimpleTy) {
-    default: return nullptr;
-    case MVT::i8:
-      if (isCN)
-        Opc = AtomicOpcTbl[Op][ConstantI8];
-      else
-        Opc = AtomicOpcTbl[Op][I8];
-      break;
-    case MVT::i16:
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = AtomicOpcTbl[Op][SextConstantI16];
-        else
-          Opc = AtomicOpcTbl[Op][ConstantI16];
-      } else
-        Opc = AtomicOpcTbl[Op][I16];
-      break;
-    case MVT::i32:
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = AtomicOpcTbl[Op][SextConstantI32];
-        else
-          Opc = AtomicOpcTbl[Op][ConstantI32];
-      } else
-        Opc = AtomicOpcTbl[Op][I32];
-      break;
-    case MVT::i64:
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = AtomicOpcTbl[Op][SextConstantI64];
-        else if (i64immSExt32(Val.getNode()))
-          Opc = AtomicOpcTbl[Op][ConstantI64];
-        else
-          llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith");
-      } else
-        Opc = AtomicOpcTbl[Op][I64];
-      break;
-  }
-
-  assert(Opc != 0 && "Invalid arith lock transform!");
-
-  // Building the new node.
-  SDValue Ret;
-  if (isUnOp) {
-    SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain };
-    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
-  } else {
-    SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain };
-    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
-  }
-
-  // Copying the MachineMemOperand.
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-  cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
-
-  // We need to have two outputs as that is what the original instruction had.
-  // So we add a dummy, undefined output. This is safe as we checked first
-  // that no-one uses our output anyway.
-  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                 dl, NVT), 0);
-  SDValue RetVals[] = { Undef, Ret };
-  return CurDAG->getMergeValues(RetVals, dl).getNode();
-}
-
 /// Test whether the given X86ISD::CMP node has any uses which require the SF
 /// or OF bits to be accurate.
 static bool hasNoSignedComparisonUses(SDNode *N) {
@@ -2301,15 +2011,6 @@
       return nullptr;
   }

-  case ISD::ATOMIC_LOAD_XOR:
-  case ISD::ATOMIC_LOAD_AND:
-  case ISD::ATOMIC_LOAD_OR:
-  case ISD::ATOMIC_LOAD_ADD: {
-    SDNode *RetVal = selectAtomicLoadArith(Node, NVT);
-    if (RetVal)
-      return RetVal;
-    break;
-  }
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR: {
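The INT32_MIN early-exit in the deleted selector is easy to gloss over: negating the most negative 32-bit value overflows the sign-extended 32-bit immediate field, while the original (non-negated) value still fits the add. A minimal standalone C++ sketch of that check, illustrative only and not LLVM code:

```c++
#include <cstdint>
#include <cstdio>

// Why "add INT32_MIN" must not be rewritten as "sub -INT32_MIN": the
// negated value no longer fits a sign-extended 32-bit immediate, while
// the original value does.
int main() {
  int64_t CNVal = INT32_MIN;   // -2147483648
  int64_t Negated = -CNVal;    //  2147483648
  std::printf("add imm32 ok: %d\n", (int32_t)CNVal == CNVal);     // prints 1
  std::printf("sub imm32 ok: %d\n", (int32_t)Negated == Negated); // prints 0
  return 0;
}
```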
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -514,6 +514,10 @@
       LCMPXCHG8_DAG,
       LCMPXCHG16_DAG,

+      /// LOCK-prefixed arithmetic read-modify-write instructions.
+      /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+      LADD, LSUB, LOR, LXOR, LAND,
+
       // Load, scalar_to_vector, and zero extend.
       VZEXT_LOAD,
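The new opcodes model the LOCK-prefixed forms as first-class SelectionDAG nodes that produce EFLAGS and a chain rather than a data result. The source shape this targets is an atomic RMW whose return value is discarded; a sketch, with the expected x86-64 codegen noted in comments (illustrative only):

```c++
#include <atomic>

std::atomic<int> Flags{0};

// An atomicrmw whose result is unused can be lowered to X86ISD::LOR and
// selected as a single LOCK-prefixed instruction, roughly
//   lock orl $4, Flags(%rip)
// with no register round-trip for the old value (expected x86-64
// codegen; illustrative only).
void setBit() {
  Flags.fetch_or(4, std::memory_order_seq_cst); // result discarded
}
```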
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -480,6 +480,10 @@
   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
   }

@@ -20333,19 +20337,68 @@
   return LowerVectorCTPOP(Op, Subtarget, DAG);
 }

-static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
-  SDNode *Node = Op.getNode();
-  SDLoc dl(Node);
-  EVT T = Node->getValueType(0);
-  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
-                              DAG.getConstant(0, dl, T), Node->getOperand(2));
-  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
-                       cast<AtomicSDNode>(Node)->getMemoryVT(),
-                       Node->getOperand(0),
-                       Node->getOperand(1), negOp,
-                       cast<AtomicSDNode>(Node)->getMemOperand(),
-                       cast<AtomicSDNode>(Node)->getOrdering(),
-                       cast<AtomicSDNode>(Node)->getSynchScope());
+static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
+  unsigned NewOpc = 0;
+  switch (N->getOpcode()) {
+  case ISD::ATOMIC_LOAD_ADD:
+    NewOpc = X86ISD::LADD;
+    break;
+  case ISD::ATOMIC_LOAD_SUB:
+    NewOpc = X86ISD::LSUB;
+    break;
+  case ISD::ATOMIC_LOAD_OR:
+    NewOpc = X86ISD::LOR;
+    break;
+  case ISD::ATOMIC_LOAD_XOR:
+    NewOpc = X86ISD::LXOR;
+    break;
+  case ISD::ATOMIC_LOAD_AND:
+    NewOpc = X86ISD::LAND;
+    break;
+  default:
+    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
+  }
+
+  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+  return DAG.getMemIntrinsicNode(
+      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
+      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
+      /*MemVT=*/N->getSimpleValueType(0), MMO);
+}
+
+/// Lower atomic_load_ops into LOCK-prefixed operations.
+static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget) {
+  SDValue Chain = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  unsigned Opc = N->getOpcode();
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);
+
+  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
+  // can only be lowered when the result is unused. They should have already
+  // been transformed into a cmpxchg loop in AtomicExpand.
+  if (N->hasAnyUseOfValue(0)) {
+    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
+    // select LXADD if LOCK_SUB can't be selected.
+    if (Opc == ISD::ATOMIC_LOAD_SUB) {
+      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
+      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
+      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
+                           RHS, AN->getMemOperand(), AN->getOrdering(),
+                           AN->getSynchScope());
+    }
+    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
+           "Used AtomicRMW ops other than Add should have been expanded!");
+    return N;
+  }
+
+  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
+  // RAUW the chain, but don't worry about the result, as it's unused.
+  assert(!N->hasAnyUseOfValue(0));
+  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
+  return SDValue();
 }

 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
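The used-result path above keeps only ATOMIC_LOAD_ADD (selectable as LXADD, i.e. lock xadd, which does return the old value) and rewrites a used subtraction into an addition of the negated operand. In source terms, this is the shape it preserves (expected x86-64 codegen noted in comments; illustrative only):

```c++
#include <atomic>

std::atomic<long> Balance{100};

// The result of fetch_sub is used here, so the node cannot become a
// plain LOCK SUB (which only produces flags). lowerAtomicArith rewrites
//   (atomic_load_sub p, v)  ->  (atomic_load_add p, 0 - v)
// so the RMW can still be a single "lock xaddq" rather than a cmpxchg
// loop (expected x86-64 codegen; illustrative only).
long withdraw(long Amount) {
  return Balance.fetch_sub(Amount) - Amount; // returns the new balance
}
```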
@@ -20767,7 +20820,11 @@
   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
     return LowerCMP_SWAP(Op, Subtarget, DAG);
   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
-  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
@@ -21221,6 +21278,11 @@
   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
+  case X86ISD::LADD:               return "X86ISD::LADD";
+  case X86ISD::LSUB:               return "X86ISD::LSUB";
+  case X86ISD::LOR:                return "X86ISD::LOR";
+  case X86ISD::LXOR:               return "X86ISD::LXOR";
+  case X86ISD::LAND:               return "X86ISD::LAND";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
@@ -28861,6 +28923,26 @@
   return SDValue();
 }

+// Canonicalize (LSUB p, 1) -> (LADD p, -1).
+static SDValue performLSUBCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  SDValue Chain = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  MVT VT = RHS.getSimpleValueType();
+  SDLoc DL(N);
+
+  auto *C = dyn_cast<ConstantSDNode>(RHS);
+  if (!C || C->getZExtValue() != 1)
+    return SDValue();
+
+  RHS = DAG.getConstant(-1, DL, VT);
+  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
+                                 DAG.getVTList(MVT::i32, MVT::Other),
+                                 {Chain, LHS, RHS}, VT, MMO);
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -28937,6 +29019,7 @@
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
+  case X86ISD::LSUB:        return performLSUBCombine(N, DAG, Subtarget);
   }

   return SDValue();
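performLSUBCombine exists so the INC/DEC patterns in X86InstrCompiler.td can fire: once (LSUB p, 1) is rewritten to (LADD p, -1), it is exactly the shape the LOCK_DEC patterns match (guarded by NotSlowIncDec). In source terms (expected x86-64 codegen noted in comments; illustrative only):

```c++
#include <atomic>

std::atomic<int> RefCount{1};

// (LSUB p, 1) is canonicalized to (LADD p, -1), which is the shape the
// LOCK_DEC patterns match, so this should select
//   lock decl RefCount(%rip)
// on subtargets where inc/dec are not slow (expected x86-64 codegen;
// illustrative only).
void release() {
  RefCount.fetch_sub(1, std::memory_order_acq_rel); // result unused
}
```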
Index: llvm/trunk/lib/Target/X86/X86InstrCompiler.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrCompiler.td
+++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td
@@ -568,7 +568,7 @@
 // ImmOpc8 corresponds to the mi8 version of the instruction
 // ImmMod corresponds to the instruction format of the mi and mi8 versions
 multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
-                           Format ImmMod, string mnemonic> {
+                           Format ImmMod, SDPatternOperator Op, string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
     SchedRW = [WriteALULd, WriteRMW] in {

 def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
                  MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
                  !strconcat(mnemonic, "{b}\t",
                             "{$src2, $dst|$dst, $src2}"),
-                 [], IIC_ALU_NONMEM>, LOCK;
+                 [(set EFLAGS, (Op addr:$dst, GR8:$src2))],
+                 IIC_ALU_NONMEM>, LOCK;
+
 def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                   MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
                   !strconcat(mnemonic, "{w}\t",
                              "{$src2, $dst|$dst, $src2}"),
-                  [], IIC_ALU_NONMEM>, OpSize16, LOCK;
+                  [(set EFLAGS, (Op addr:$dst, GR16:$src2))],
+                  IIC_ALU_NONMEM>, OpSize16, LOCK;
+
 def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                   MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
                   !strconcat(mnemonic, "{l}\t",
                              "{$src2, $dst|$dst, $src2}"),
-                  [], IIC_ALU_NONMEM>, OpSize32, LOCK;
+                  [(set EFLAGS, (Op addr:$dst, GR32:$src2))],
+                  IIC_ALU_NONMEM>, OpSize32, LOCK;
+
 def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                     RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                    MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                    !strconcat(mnemonic, "{q}\t",
                               "{$src2, $dst|$dst, $src2}"),
-                   [], IIC_ALU_NONMEM>, LOCK;
+                   [(set EFLAGS, (Op addr:$dst, GR64:$src2))],
+                   IIC_ALU_NONMEM>, LOCK;

 def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                     ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
                    ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
                    !strconcat(mnemonic, "{b}\t",
                               "{$src2, $dst|$dst, $src2}"),
-                   [], IIC_ALU_MEM>, LOCK;
+                   [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))],
+                   IIC_ALU_MEM>, LOCK;

 def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                      ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
                      !strconcat(mnemonic, "{w}\t",
                                 "{$src2, $dst|$dst, $src2}"),
-                     [], IIC_ALU_MEM>, OpSize16, LOCK;
+                     [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))],
+                     IIC_ALU_MEM>, OpSize16, LOCK;

 def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                      ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
                      !strconcat(mnemonic, "{l}\t",
                                 "{$src2, $dst|$dst, $src2}"),
-                     [], IIC_ALU_MEM>, OpSize32, LOCK;
+                     [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))],
+                     IIC_ALU_MEM>, OpSize32, LOCK;

 def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                           ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
                          !strconcat(mnemonic, "{q}\t",
                                     "{$src2, $dst|$dst, $src2}"),
-                         [], IIC_ALU_MEM>, LOCK;
+                         [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))],
+                         IIC_ALU_MEM>, LOCK;

 def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                      ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
                      !strconcat(mnemonic, "{w}\t",
                                 "{$src2, $dst|$dst, $src2}"),
-                     [], IIC_ALU_MEM>, OpSize16, LOCK;
+                     [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))],
+                     IIC_ALU_MEM>, OpSize16, LOCK;
+
 def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                      ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
                      !strconcat(mnemonic, "{l}\t",
                                 "{$src2, $dst|$dst, $src2}"),
-                     [], IIC_ALU_MEM>, OpSize32, LOCK;
+                     [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))],
+                     IIC_ALU_MEM>, OpSize32, LOCK;
+
 def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                        ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                       ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
                       !strconcat(mnemonic, "{q}\t",
                                  "{$src2, $dst|$dst, $src2}"),
-                      [], IIC_ALU_MEM>, LOCK;
+                      [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))],
+                      IIC_ALU_MEM>, LOCK;
 }
 }

-defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
-defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
-defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
-defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
-defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">;
+defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
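The mi8 forms above are guarded by i16immSExt8/i32immSExt8-style predicates: they match only when the constant survives a round trip through a sign-extended 8-bit immediate, selecting the shorter encoding (e.g. "lock addl $8, (%rdi)" instead of a 4-byte immediate). A standalone C++ sketch of that predicate, illustrative only:

```c++
#include <cstdint>
#include <cstdio>

// True when an immediate fits a sign-extended 8-bit field, i.e. the
// value is unchanged by truncation to int8_t and sign-extension back.
static bool fitsInSignExtendedInt8(int64_t Imm) {
  return Imm == static_cast<int8_t>(Imm);
}

int main() {
  std::printf("%d\n", fitsInSignExtendedInt8(8));    // 1 -> mi8 form
  std::printf("%d\n", fitsInSignExtendedInt8(-128)); // 1 -> mi8 form
  std::printf("%d\n", fitsInSignExtendedInt8(300));  // 0 -> mi form
  return 0;
}
```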

-// Optimized codegen when the non-memory output is not used.
 multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
-                          string mnemonic> {
+                          int Increment, string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
-    SchedRW = [WriteALULd, WriteRMW] in {
-
+    SchedRW = [WriteALULd, WriteRMW], Predicates = [NotSlowIncDec] in {
 def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
                  !strconcat(mnemonic, "{b}\t$dst"),
-                 [], IIC_UNARY_MEM>, LOCK;
+                 [(set EFLAGS, (X86lock_add addr:$dst, (i8 Increment)))],
+                 IIC_UNARY_MEM>, LOCK;
 def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
                  !strconcat(mnemonic, "{w}\t$dst"),
-                 [], IIC_UNARY_MEM>, OpSize16, LOCK;
+                 [(set EFLAGS, (X86lock_add addr:$dst, (i16 Increment)))],
+                 IIC_UNARY_MEM>, OpSize16, LOCK;
 def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
                  !strconcat(mnemonic, "{l}\t$dst"),
-                 [], IIC_UNARY_MEM>, OpSize32, LOCK;
+                 [(set EFLAGS, (X86lock_add addr:$dst, (i32 Increment)))],
+                 IIC_UNARY_MEM>, OpSize32, LOCK;
 def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
                   !strconcat(mnemonic, "{q}\t$dst"),
-                  [], IIC_UNARY_MEM>, LOCK;
+                  [(set EFLAGS, (X86lock_add addr:$dst, (i64 Increment)))],
+                  IIC_UNARY_MEM>, LOCK;
 }
 }

-defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
-defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, 1, "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">;

 // Atomic compare and swap.
 multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.td
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.td
@@ -72,6 +72,10 @@
                                          SDTCisVT<2, i8>]>;
 def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;

+def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                                       SDTCisPtrTy<1>,
+                                                       SDTCisInt<2>]>;
+
 def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;

 def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
@@ -235,6 +239,22 @@
 def X86and_flag  : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;

+def X86lock_add  : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+def X86lock_sub  : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+def X86lock_or   : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+def X86lock_xor  : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+def X86lock_and  : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+
 def X86bextr  : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;

 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
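SDTLockBinaryArithWithFlags types each X86lock_* node as producing an i32 EFLAGS value plus a chain, and the instruction patterns bind that result with "set EFLAGS". That makes the flags a LOCK-prefixed RMW already computes visible to the DAG; whether they are actually reused in place of a separate compare depends on later combines. A source-level sketch of the kind of code that could benefit (illustrative only):

```c++
#include <atomic>

std::atomic<int> RefCount{2};

// "lock sub/dec" already sets ZF according to the new value; modeling
// the X86lock_* nodes as EFLAGS producers is what would let the
// comparison below reuse those flags instead of a separate test
// (whether that folding happens depends on later combines;
// illustrative only).
bool release() {
  return RefCount.fetch_sub(1, std::memory_order_acq_rel) == 1;
}
```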