Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -26134,6 +26134,56 @@
   return SDValue();
 }
 
+/// Combine:
+///   (brcond/cmov/setcc .., (cmp (atomic_load_op ..), 0), cc)
+/// to:
+///   (brcond/cmov/setcc .., (LOCKed op ..), cc)
+/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+/// Note that this is only legal for some op/cc combinations.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode CC,
+                                       SelectionDAG &DAG) {
+  // This combine only operates on CMP-like nodes.
+  if (!(Cmp.getOpcode() == X86ISD::CMP ||
+        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+    return SDValue();
+
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+
+  if (!LHS.hasOneUse())
+    return SDValue();
+
+  // FIXME: We can do this for XOR/OR/AND as well, but only if they survive
+  // AtomicExpand. Currently, we choose to expand them to cmpxchg if they
+  // have any users. Could we relax that to ignore (icmp x, 0) users?
+  switch (LHS->getOpcode()) {
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+    break;
+  default:
+    return SDValue();
+  }
+
+  auto *C = dyn_cast<ConstantSDNode>(RHS);
+  if (!C || C->getZExtValue() != 0)
+    return SDValue();
+
+  // Don't do this for all condition codes, as OF/CF are cleared by
+  // (CMP x, 0) but might be set by arithmetic. Furthermore, we might later
+  // select INC/DEC, which don't modify CF (though CCs using CF should have
+  // been optimized out). SF/ZF are safe as they are set the same way.
+  // Note that in theory, the transformation is also valid for P/NP.
+  if (CC != X86::COND_E && CC != X86::COND_NE && CC != X86::COND_S &&
+      CC != X86::COND_NS)
+    return SDValue();
+
+  SDValue LockOp = lowerAtomicArithWithLOCK(LHS, DAG);
+  DAG.ReplaceAllUsesOfValueWith(LHS.getValue(0),
+                                DAG.getUNDEF(LHS.getValueType()));
+  DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LockOp.getValue(1));
+  return LockOp;
+}
+
 // Check whether a boolean test is testing a boolean value generated by
 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
 // code.
@@ -26305,6 +26355,16 @@
   return true;
 }
 
+/// Optimize an EFLAGS definition used according to the condition code \p CC
+/// into a simpler EFLAGS value, potentially returning a new \p CC and
+/// replacing uses of chain values.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+                                  SelectionDAG &DAG) {
+  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
+    return R;
+  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+}
+
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
@@ -26331,15 +26391,14 @@
     }
   }
 
-  SDValue Flags;
-
-  Flags = checkBoolTestSetCCCombine(Cond, CC);
-  if (Flags.getNode() &&
-      // Extra check as FCMOV only supports a subset of X86 cond.
-      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
-    SDValue Ops[] = { FalseOp, TrueOp,
-                      DAG.getConstant(CC, DL, MVT::i8), Flags };
-    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+  // Try to simplify the EFLAGS and condition code operands.
+  // We can't always do this as FCMOV only supports a subset of X86 cond.
+  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
+                       Flags};
+      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+    }
   }
 
   // If this is a select between two integer constants, try to do some
@@ -29265,7 +29324,8 @@
   if (CC == X86::COND_B)
     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
 
-  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+  // Try to simplify the EFLAGS and condition code operands.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
   }
@@ -29278,15 +29338,16 @@
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
   SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  SDValue Dest = N->getOperand(1);
   SDValue EFLAGS = N->getOperand(3);
   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
 
-  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+  // Try to simplify the EFLAGS and condition code operands.
+  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
+  // RAUW them under us.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
-    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
-                       Flags);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+                       N->getOperand(1), Cond, Flags);
   }
 
   return SDValue();
Index: llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll
+++ llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -4,9 +4,7 @@
 define i8 @test_add_1_setcc_ne(i64* %p) #0 {
 ; CHECK-LABEL: test_add_1_setcc_ne:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock incq (%rdi)
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
 entry:
@@ -19,9 +17,7 @@
 define i8 @test_sub_1_setcc_eq(i64* %p) #0 {
 ; CHECK-LABEL: test_sub_1_setcc_eq:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock decq (%rdi)
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
 entry:
@@ -49,9 +45,7 @@
 define i8 @test_sub_10_setcc_sge(i64* %p) #0 {
 ; CHECK-LABEL: test_sub_10_setcc_sge:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-10, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock addq $-10, (%rdi)
 ; CHECK-NEXT:    setns %al
 ; CHECK-NEXT:    retq
 entry:
@@ -66,9 +60,7 @@
 define i32 @test_add_10_brcond_sge(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_add_10_brcond_sge:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movl $10, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock addq $10, (%rdi)
 ; CHECK-NEXT:    js .LBB4_2
 ; CHECK-NEXT:  # BB#1: # %t
 ; CHECK-NEXT:    movl %esi, %eax
@@ -89,9 +81,7 @@
 define i32 @test_sub_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_sub_1_cmov_slt:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock decq (%rdi)
 ; CHECK-NEXT:    cmovnsl %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq
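
For reference, the IR shape these tests exercise looks roughly like the
following (an illustrative sketch matching test_add_1_setcc_ne; the actual
test bodies lie outside the diff context shown above, and the value names
here are made up):

  define i8 @test_add_1_setcc_ne(i64* %p) #0 {
  entry:
    ; The atomicrmw result's only use is the comparison against 0, which is
    ; exactly the pattern combineSetCCAtomicArith matches.
    %old = atomicrmw add i64* %p, i64 1 seq_cst
    %cmp = icmp ne i64 %old, 0
    %res = zext i1 %cmp to i8
    ret i8 %res
  }

Since the pre-increment value is otherwise dead, there is no need for a
lock xaddq that returns it; the backend can emit lock incq (%rdi) instead
and feed the EFLAGS it produces directly to setne.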