Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -569,7 +569,7 @@
 
       /// LOCK-prefixed arithmetic read-modify-write instructions.
       /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
-      LADD, LSUB, LOR, LXOR, LAND,
+      LADD, LSUB, LOR, LXOR, LAND, LINC, LDEC,
 
       // Load, scalar_to_vector, and zero extend.
       VZEXT_LOAD,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23575,7 +23575,9 @@
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
-static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
+static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget,
+                                        bool AllowIncDec = true) {
   unsigned NewOpc = 0;
   switch (N->getOpcode()) {
   case ISD::ATOMIC_LOAD_ADD:
@@ -23598,6 +23600,26 @@
   }
 
   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+
+  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+    // Convert to inc/dec if they aren't slow or we are optimizing for size.
+    if (AllowIncDec && (!Subtarget.slowIncDec() ||
+                        DAG.getMachineFunction().getFunction()->optForSize())) {
+      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
+          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
+        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
+                                       DAG.getVTList(MVT::i32, MVT::Other),
+                                       {N->getOperand(0), N->getOperand(1)},
+                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
+      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
+          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
+        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
+                                       DAG.getVTList(MVT::i32, MVT::Other),
+                                       {N->getOperand(0), N->getOperand(1)},
+                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
+    }
+  }
+
   return DAG.getMemIntrinsicNode(
       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
@@ -23631,7 +23653,7 @@
     return N;
   }
 
-  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
+  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
   DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
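
Note: with the hunks above, an atomicrmw add/sub of 1 (or -1) whose result is
unused should now select LOCK INC/DEC when inc/dec are not slow on the
subtarget, or when optimizing for size. A minimal sketch in the style of
test/CodeGen/X86/atomic-eflags-reuse.ll; the function name and expected-asm
comments are illustrative, not part of the patch:

define void @sketch_inc64(i64* %p) {
entry:
  %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
  ret void
}
; expected, fast inc/dec:  lock incq (%rdi)
; expected, slow inc/dec:  lock addq $1, (%rdi)
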
@@ -24709,6 +24731,8 @@
   case X86ISD::LOR:                return "X86ISD::LOR";
   case X86ISD::LXOR:               return "X86ISD::LXOR";
   case X86ISD::LAND:               return "X86ISD::LAND";
+  case X86ISD::LINC:               return "X86ISD::LINC";
+  case X86ISD::LDEC:               return "X86ISD::LDEC";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
@@ -31008,7 +31032,8 @@
 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
 /// Note that this is only legal for some op/cc combinations.
 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
-                                       SelectionDAG &DAG) {
+                                       SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
   // This combine only operates on CMP-like nodes.
   if (!(Cmp.getOpcode() == X86ISD::CMP ||
         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
@@ -31068,7 +31093,16 @@
       /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
       /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
       AN->getMemOperand());
-  auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG);
+  // If the comparison uses the CF flag we can't use INC/DEC instructions.
+  bool NeedCF = false;
+  switch (CC) {
+  default: break;
+  case X86::COND_A: case X86::COND_AE:
+  case X86::COND_B: case X86::COND_BE:
+    NeedCF = true;
+    break;
+  }
+  auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                 DAG.getUNDEF(CmpLHS.getValueType()));
   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
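
Note: INC and DEC update OF/SF/ZF/AF/PF but leave CF unchanged, so conditions
that read CF (COND_A/AE/B/BE, i.e. unsigned compares) cannot reuse the flags
of a LOCK INC/DEC; passing AllowIncDec=false keeps the full LOCK ADD/SUB form
for those. A hedged sketch (function name illustrative, assuming the combine
fires as in the tests below): an equality test only needs ZF, so the dec form
stays legal here, unlike the 'ugt' case in the test update at the end of this
patch.

define i8 @sketch_sub_1_cmp_1_eq(i64* %p) {
entry:
  %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
  %tmp1 = icmp eq i64 %tmp0, 1        ; old == 1 iff new == 0 iff ZF is set
  %tmp2 = zext i1 %tmp1 to i8
  ret i8 %tmp2
}
; expected, fast inc/dec:  lock decq (%rdi) ; sete %al
; expected, slow inc/dec:  lock subq $1, (%rdi) ; sete %al
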
@@ -31091,7 +31125,7 @@
   else
     return SDValue();
 
-  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
+  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                 DAG.getUNDEF(CmpLHS.getValueType()));
   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
@@ -31298,14 +31332,15 @@
 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
 /// uses of chain values.
 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
-                                  SelectionDAG &DAG) {
+                                  SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
   if (CC == X86::COND_B)
     if (SDValue Flags = combineCarryThroughADD(EFLAGS))
       return Flags;
 
   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
     return R;
-  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
 }
 
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
@@ -31332,7 +31367,7 @@
 
   // Try to simplify the EFLAGS and condition code operands.
   // We can't always do this as FCMOV only supports a subset of X86 cond.
-  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
     if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
       SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
                        Flags};
@@ -35478,7 +35513,7 @@
   SDValue EFLAGS = N->getOperand(1);
 
   // Try to simplify the EFLAGS and condition code operands.
-  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
     return getSETCC(CC, Flags, DL, DAG);
 
   return SDValue();
@@ -35494,7 +35529,7 @@
   // Try to simplify the EFLAGS and condition code operands.
   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
   // RAUW them under us.
-  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                        N->getOperand(1), Cond, Flags);
Index: lib/Target/X86/X86InstrCompiler.td
===================================================================
--- lib/Target/X86/X86InstrCompiler.td
+++ lib/Target/X86/X86InstrCompiler.td
@@ -696,33 +696,53 @@
 defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
 
 multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
-                          SDNode Op, string mnemonic> {
+                          string frag, string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
     SchedRW = [WriteALULd, WriteRMW] in {
 def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
                  !strconcat(mnemonic, "{b}\t$dst"),
-                 [(set EFLAGS, (Op addr:$dst, (i8 1)))],
+                 [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))],
                  IIC_UNARY_MEM>, LOCK;
 def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
                  !strconcat(mnemonic, "{w}\t$dst"),
-                 [(set EFLAGS, (Op addr:$dst, (i16 1)))],
+                 [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))],
                  IIC_UNARY_MEM>, OpSize16, LOCK;
 def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
                  !strconcat(mnemonic, "{l}\t$dst"),
-                 [(set EFLAGS, (Op addr:$dst, (i32 1)))],
+                 [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))],
                  IIC_UNARY_MEM>, OpSize32, LOCK;
 def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
                   !strconcat(mnemonic, "{q}\t$dst"),
-                  [(set EFLAGS, (Op addr:$dst, (i64 1)))],
+                  [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))],
                   IIC_UNARY_MEM>, LOCK;
 }
 }
 
-let Predicates = [UseIncDec] in {
-defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, X86lock_add, "inc">;
-defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, X86lock_sub, "dec">;
+multiclass unary_atomic_intrin<SDNode atomic_op> {
+  def _8 : PatFrag<(ops node:$ptr),
+                   (atomic_op node:$ptr), [{
+    return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+  }]>;
+  def _16 : PatFrag<(ops node:$ptr),
+                    (atomic_op node:$ptr), [{
+    return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+  }]>;
+  def _32 : PatFrag<(ops node:$ptr),
+                    (atomic_op node:$ptr), [{
+    return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+  }]>;
+  def _64 : PatFrag<(ops node:$ptr),
+                    (atomic_op node:$ptr), [{
+    return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+  }]>;
 }
 
+defm X86lock_inc : unary_atomic_intrin<X86lock_inc>;
+defm X86lock_dec : unary_atomic_intrin<X86lock_dec>;
+
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "X86lock_inc", "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "X86lock_dec", "dec">;
+
 // Atomic compare and swap.
 multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
                          SDPatternOperator frag, X86MemOperand x86memop,
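
Note: the _8/_16/_32/_64 PatFrags dispatch on the node's memory VT, since
LINC/LDEC only produce i32 flags and carry their operand width in the memory
operand; each LOCK_INC/LOCK_DEC def therefore matches exactly one width. A
small sketch (function name illustrative, not from the patch):

define void @sketch_inc16(i16* %p) {
entry:
  %tmp0 = atomicrmw add i16* %p, i16 1 seq_cst
  ret void
}
; expected, fast inc/dec:  lock incw (%rdi)
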
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -82,6 +82,9 @@
                                                   SDTCisPtrTy<1>,
                                                   SDTCisInt<2>]>;
 
+def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+                                                      SDTCisPtrTy<1>]>;
+
 def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
 
 def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
@@ -271,6 +274,13 @@
                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
                           SDNPMemOperand]>;
 
+def X86lock_inc : SDNode<"X86ISD::LINC", SDTLockUnaryArithWithFlags,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                          SDNPMemOperand]>;
+def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                          SDNPMemOperand]>;
+
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
 def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
Index: test/CodeGen/X86/atomic-eflags-reuse.ll
===================================================================
--- test/CodeGen/X86/atomic-eflags-reuse.ll
+++ test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -45,12 +45,19 @@
 }
 
 define i32 @test_sub_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
-; CHECK-LABEL: test_sub_1_cmov_sle:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lock addq $-1, (%rdi)
-; CHECK-NEXT:    cmovgel %edx, %esi
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    retq
+; FASTINCDEC-LABEL: test_sub_1_cmov_sle:
+; FASTINCDEC:       # BB#0: # %entry
+; FASTINCDEC-NEXT:    lock decq (%rdi)
+; FASTINCDEC-NEXT:    cmovgel %edx, %esi
+; FASTINCDEC-NEXT:    movl %esi, %eax
+; FASTINCDEC-NEXT:    retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_cmov_sle:
+; SLOWINCDEC:       # BB#0: # %entry
+; SLOWINCDEC-NEXT:    lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT:    cmovgel %edx, %esi
+; SLOWINCDEC-NEXT:    movl %esi, %eax
+; SLOWINCDEC-NEXT:    retq
 entry:
   %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
   %tmp1 = icmp sle i64 %tmp0, 0
@@ -59,12 +66,19 @@
 }
 
 define i32 @test_sub_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
-; CHECK-LABEL: test_sub_1_cmov_sgt:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lock addq $-1, (%rdi)
-; CHECK-NEXT:    cmovll %edx, %esi
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    retq
+; FASTINCDEC-LABEL: test_sub_1_cmov_sgt:
+; FASTINCDEC:       # BB#0: # %entry
+; FASTINCDEC-NEXT:    lock decq (%rdi)
+; FASTINCDEC-NEXT:    cmovll %edx, %esi
+; FASTINCDEC-NEXT:    movl %esi, %eax
+; FASTINCDEC-NEXT:    retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_cmov_sgt:
+; SLOWINCDEC:       # BB#0: # %entry
+; SLOWINCDEC-NEXT:    lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT:    cmovll %edx, %esi
+; SLOWINCDEC-NEXT:    movl %esi, %eax
+; SLOWINCDEC-NEXT:    retq
 entry:
   %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
   %tmp1 = icmp sgt i64 %tmp0, 0
@@ -89,11 +103,17 @@
 }
 
 define i8 @test_sub_1_setcc_sgt(i64* %p) #0 {
-; CHECK-LABEL: test_sub_1_setcc_sgt:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lock addq $-1, (%rdi)
-; CHECK-NEXT:    setge %al
-; CHECK-NEXT:    retq
+; FASTINCDEC-LABEL: test_sub_1_setcc_sgt:
+; FASTINCDEC:       # BB#0: # %entry
+; FASTINCDEC-NEXT:    lock decq (%rdi)
+; FASTINCDEC-NEXT:    setge %al
+; FASTINCDEC-NEXT:    retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_setcc_sgt:
+; SLOWINCDEC:       # BB#0: # %entry
+; SLOWINCDEC-NEXT:    lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT:    setge %al
+; SLOWINCDEC-NEXT:    retq
 entry:
   %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
   %tmp1 = icmp sgt i64 %tmp0, 0
@@ -257,17 +277,11 @@
 }
 
 define i8 @test_sub_1_cmp_1_setcc_ugt(i64* %p) #0 {
-; FASTINCDEC-LABEL: test_sub_1_cmp_1_setcc_ugt:
-; FASTINCDEC:       # BB#0: # %entry
-; FASTINCDEC-NEXT:    lock decq (%rdi)
-; FASTINCDEC-NEXT:    seta %al
-; FASTINCDEC-NEXT:    retq
-;
-; SLOWINCDEC-LABEL: test_sub_1_cmp_1_setcc_ugt:
-; SLOWINCDEC:       # BB#0: # %entry
-; SLOWINCDEC-NEXT:    lock subq $1, (%rdi)
-; SLOWINCDEC-NEXT:    seta %al
-; SLOWINCDEC-NEXT:    retq
+; CHECK-LABEL: test_sub_1_cmp_1_setcc_ugt:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    lock subq $1, (%rdi)
+; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    retq
 entry:
   %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
   %tmp1 = icmp ugt i64 %tmp0, 1