diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -258,6 +258,8 @@
     MaskedIntrinsic,  // Use a target-specific intrinsic for the LL/SC loop.
     BitTestIntrinsic, // Use a target-specific intrinsic for special bit
                       // operations; used by X86.
+    CmpArithIntrinsic,// Use a target-specific intrinsic for special compare
+                      // operations; used by X86.
     Expand,           // Generic expansion in terms of other atomic operations.
 
     // Rewrite to a non-atomic form for use in a known non-preemptible
@@ -2018,6 +2020,14 @@
         "Bit test atomicrmw expansion unimplemented on this target");
   }
 
+  /// Perform an atomicrmw whose result is only used by a comparison, using a
+  /// target-specific intrinsic. This represents the combined atomic and
+  /// compare intrinsic which will be lowered at a late stage by the backend.
+  virtual void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
+    llvm_unreachable(
+        "Compare arith atomicrmw expansion unimplemented on this target");
+  }
+
   /// Perform a masked cmpxchg using a target-specific intrinsic. This
   /// represents the core LL/SC loop which will be lowered at a late stage by
   /// the backend. The target-specific intrinsic returns the loaded value and
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -72,6 +72,14 @@
                                      [ImmArg<ArgIndex<1>>]>;
 }
 
+// Lock binary arith with CC.
+let TargetPrefix = "x86" in {
+  def int_x86_atomic_add_cc : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty, llvm_i32_ty],
+                                        [ImmArg<ArgIndex<2>>]>;
+  def int_x86_atomic_sub_cc : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty, llvm_i32_ty],
+                                        [ImmArg<ArgIndex<2>>]>;
+}
+
 // Read Processor Register.
 let TargetPrefix = "x86" in {
   def int_x86_rdpru : ClangBuiltin<"__builtin_ia32_rdpru">,
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -604,6 +604,10 @@
     TLI->emitBitTestAtomicRMWIntrinsic(AI);
    return true;
   }
+  case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: {
+    TLI->emitCmpArithAtomicRMWIntrinsic(AI);
+    return true;
+  }
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     return lowerAtomicRMWInst(AI);
   case TargetLoweringBase::AtomicExpansionKind::Expand:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1682,6 +1682,7 @@
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
   void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
+  void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5659,7 +5659,9 @@
   case Intrinsic::x86_aor32:
   case Intrinsic::x86_aor64:
   case Intrinsic::x86_axor32:
-  case Intrinsic::x86_axor64: {
+  case Intrinsic::x86_axor64:
+  case Intrinsic::x86_atomic_add_cc:
+  case Intrinsic::x86_atomic_sub_cc: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.ptrVal = I.getArgOperand(0);
     unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
@@ -28382,6 +28384,32 @@
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
                                    {Chain, Op1, Op2}, VT, MMO);
   }
+  case Intrinsic::x86_atomic_add_cc:
+  case Intrinsic::x86_atomic_sub_cc: {
+    SDLoc DL(Op);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(2);
+    SDValue Op2 = Op.getOperand(3);
+    X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
+    MVT VT = Op2.getSimpleValueType();
+    unsigned Opc = 0;
+    switch (IntNo) {
+    default:
+      llvm_unreachable("Unknown Intrinsic");
+    case Intrinsic::x86_atomic_add_cc:
+      Opc = X86ISD::LADD;
+      break;
+    case Intrinsic::x86_atomic_sub_cc:
+      Opc = X86ISD::LSUB;
+      break;
+    }
+    MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
+    SDValue LockArith =
+        DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
+                                {Chain, Op1, Op2}, VT, MMO);
+    Chain = LockArith.getValue(1);
+    return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
+  }
   }
   return SDValue();
 }
@@ -31364,6 +31392,74 @@
   AI->eraseFromParent();
 }
 
+static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
+  using namespace llvm::PatternMatch;
+  if (!AI->hasOneUse())
+    return false;
+
+  Value *Op = AI->getOperand(1);
+  ICmpInst::Predicate Pred;
+  Instruction *I = AI->user_back();
+  AtomicRMWInst::BinOp Opc = AI->getOperation();
+  if (Opc == AtomicRMWInst::Add) {
+    if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
+      return Pred == CmpInst::ICMP_EQ;
+    if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value()))) &&
+        match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
+      return Pred == CmpInst::ICMP_SLT;
+    return false;
+  }
+  if (Opc == AtomicRMWInst::Sub) {
+    if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
+      return Pred == CmpInst::ICMP_EQ;
+    if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op)))) &&
+        match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
+      return Pred == CmpInst::ICMP_SLT;
+    return false;
+  }
+
+  return false;
+}
+
+void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
+    AtomicRMWInst *AI) const {
+  IRBuilder<> Builder(AI);
+  Instruction *TempI = nullptr;
+  LLVMContext &Ctx = AI->getContext();
+  ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
+  if (!ICI) {
+    TempI = AI->user_back();
+    assert(TempI->hasOneUse() && "Must have one use");
+    ICI = cast<ICmpInst>(TempI->user_back());
+  }
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  assert((Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLT) &&
+         "Not supported Pred");
+  X86::CondCode CC = Pred == CmpInst::ICMP_EQ ? X86::COND_E : X86::COND_S;
+  Intrinsic::ID IID = Intrinsic::not_intrinsic;
+  switch (AI->getOperation()) {
+  default:
+    llvm_unreachable("Unknown atomic operation");
+  case AtomicRMWInst::Add:
+    IID = Intrinsic::x86_atomic_add_cc;
+    break;
+  case AtomicRMWInst::Sub:
+    IID = Intrinsic::x86_atomic_sub_cc;
+    break;
+  }
+  Function *CmpArith =
+      Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
+  Value *Call = Builder.CreateCall(CmpArith, {AI->getPointerOperand(),
+                                              AI->getValOperand(),
+                                              Builder.getInt32((unsigned)CC)});
+  Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
+  ICI->replaceAllUsesWith(Result);
+  ICI->eraseFromParent();
+  if (TempI)
+    TempI->eraseFromParent();
+  AI->eraseFromParent();
+}
+
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
@@ -31381,9 +31477,12 @@
   default:
     llvm_unreachable("Unknown atomic operation");
   case AtomicRMWInst::Xchg:
+    return AtomicExpansionKind::None;
   case AtomicRMWInst::Add:
   case AtomicRMWInst::Sub:
-    // It's better to use xadd, xsub or xchg for these in all cases.
+    if (shouldExpandCmpArithRMWInIR(AI))
+      return AtomicExpansionKind::CmpArithIntrinsic;
+    // It's better to use xadd, xsub or xchg for these in other cases.
     return AtomicExpansionKind::None;
   case AtomicRMWInst::Or:
   case AtomicRMWInst::And:
diff --git a/llvm/test/CodeGen/X86/pr37025.ll b/llvm/test/CodeGen/X86/pr37025.ll
--- a/llvm/test/CodeGen/X86/pr37025.ll
+++ b/llvm/test/CodeGen/X86/pr37025.ll
@@ -43,13 +43,13 @@
 define void @test_dec_select_commute(ptr nocapture %0, ptr readnone %1) {
 ; CHECK-LABEL: test_dec_select_commute:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
+; CHECK-NEXT:    lock decq (%rdi)
+; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    testq %rsi, %rsi
 ; CHECK-NEXT:    je .LBB1_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    cmpq $1, %rax
-; CHECK-NEXT:    jne .LBB1_2
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB1_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    jmp func2 # TAILCALL
 ; CHECK-NEXT:  .LBB1_2:
@@ -71,13 +71,13 @@
 define void @test_dec_and(ptr nocapture %0, ptr readnone %1) {
 ; CHECK-LABEL: test_dec_and:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
+; CHECK-NEXT:    lock decq (%rdi)
+; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    testq %rsi, %rsi
 ; CHECK-NEXT:    je .LBB2_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    cmpq $1, %rax
-; CHECK-NEXT:    jne .LBB2_2
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB2_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    jmp func2 # TAILCALL
 ; CHECK-NEXT:  .LBB2_2:
diff --git a/llvm/test/CodeGen/X86/pr58685.ll b/llvm/test/CodeGen/X86/pr58685.ll
--- a/llvm/test/CodeGen/X86/pr58685.ll
+++ b/llvm/test/CodeGen/X86/pr58685.ll
@@ -4,9 +4,7 @@
 define i1 @lock_add_sete(ptr %0, i32 %1) nounwind {
 ; CHECK-LABEL: lock_add_sete:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    lock xaddl %eax, (%rdi)
-; CHECK-NEXT:    addl %esi, %eax
+; CHECK-NEXT:    lock addl %esi, (%rdi)
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %3 = atomicrmw add ptr %0, i32 %1 seq_cst, align 4
@@ -18,11 +16,8 @@
 define i1 @lock_add_sets(ptr %0, i32 %1) nounwind {
 ; CHECK-LABEL: lock_add_sets:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    lock xaddl %eax, (%rdi)
-; CHECK-NEXT:    addl %esi, %eax
-; CHECK-NEXT:    shrl $31, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    lock addl %esi, (%rdi)
+; CHECK-NEXT:    sets %al
 ; CHECK-NEXT:    retq
   %3 = atomicrmw add ptr %0, i32 %1 seq_cst, align 4
   %4 = add i32 %3, %1
@@ -33,10 +28,7 @@
 define i1 @lock_sub_sete(ptr %0, i32 %1) nounwind {
 ; CHECK-LABEL: lock_sub_sete:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    negl %eax
-; CHECK-NEXT:    lock xaddl %eax, (%rdi)
-; CHECK-NEXT:    cmpl %esi, %eax
+; CHECK-NEXT:    lock subl %esi, (%rdi)
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %3 = atomicrmw sub ptr %0, i32 %1 seq_cst, align 4
@@ -47,12 +39,8 @@
 define i1 @lock_sub_sets(ptr %0, i32 %1) nounwind {
 ; CHECK-LABEL: lock_sub_sets:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    negl %eax
-; CHECK-NEXT:    lock xaddl %eax, (%rdi)
-; CHECK-NEXT:    subl %esi, %eax
-; CHECK-NEXT:    shrl $31, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    lock subl %esi, (%rdi)
+; CHECK-NEXT:    sets %al
 ; CHECK-NEXT:    retq
   %3 = atomicrmw sub ptr %0, i32 %1 seq_cst, align 4
   %4 = sub i32 %3, %1
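
For reference, a minimal IR sketch of the shape the new shouldExpandCmpArithRMWInIR hook accepts (this sketch is not part of the patch; the function and value names are made up, and it mirrors the lock_sub_sets test above): an atomicrmw sub whose only use recomputes the post-operation value, and that value's only use is a signed compare against zero. With this change, AtomicExpandPass calls emitCmpArithAtomicRMWIntrinsic, the atomicrmw is rewritten to a call to llvm.x86.atomic.sub.cc, and the backend should emit lock subl plus sets instead of lock xaddl plus extra arithmetic, as the updated pr58685.ll checks show.

; Hypothetical example, same pattern as lock_sub_sets in pr58685.ll.
define i1 @dec_is_negative(ptr %counter, i32 %amount) nounwind {
  %old = atomicrmw sub ptr %counter, i32 %amount seq_cst, align 4
  %new = sub i32 %old, %amount   ; sole use of %old: recompute the stored value
  %neg = icmp slt i32 %new, 0    ; sole use of %new: test the sign bit
  ret i1 %neg
}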