diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -257,6 +257,8 @@ MaskedIntrinsic, // Use a target-specific intrinsic for the LL/SC loop. BitTestIntrinsic, // Use a target-specific intrinsic for special bit // operations; used by X86. + CmpArithIntrinsic,// Use a target-specific intrinsic for special compare + // operations; used by X86. Expand, // Generic expansion in terms of other atomic operations. // Rewrite to a non-atomic form for use in a known non-preemptible @@ -2016,6 +2018,14 @@ "Bit test atomicrmw expansion unimplemented on this target"); } + /// Perform an atomicrmw whose result is only used by comparison, using a + /// target-specific intrinsic. This represents the combined atomic and compare + /// intrinsic which will be lowered at a late stage by the backend. + virtual void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const { + llvm_unreachable( + "Compare arith atomicrmw expansion unimplemented on this target"); + } + /// Perform a masked cmpxchg using a target-specific intrinsic. This /// represents the core LL/SC loop which will be lowered at a late stage by /// the backend. The target-specific intrinsic returns the loaded value and diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -72,6 +72,14 @@ [ImmArg>]>; } +// Lock binary arith with CC. +let TargetPrefix = "x86" in { + def int_x86_atomic_add_cc : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty, llvm_i32_ty], + [ImmArg<ArgIndex<2>>]>; + def int_x86_atomic_sub_cc : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty, llvm_i32_ty], + [ImmArg<ArgIndex<2>>]>; +} + // Read Processor Register. 
let TargetPrefix = "x86" in { def int_x86_rdpru : ClangBuiltin<"__builtin_ia32_rdpru">, diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -608,6 +608,10 @@ TLI->emitBitTestAtomicRMWIntrinsic(AI); return true; } + case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: { + TLI->emitCmpArithAtomicRMWIntrinsic(AI); + return true; + } case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicRMWInst(AI); case TargetLoweringBase::AtomicExpansionKind::Expand: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1682,6 +1682,7 @@ TargetLoweringBase::AtomicExpansionKind shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const; void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; + void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5659,7 +5659,9 @@ case Intrinsic::x86_aor32: case Intrinsic::x86_aor64: case Intrinsic::x86_axor32: - case Intrinsic::x86_axor64: { + case Intrinsic::x86_axor64: + case Intrinsic::x86_atomic_add_cc: + case Intrinsic::x86_atomic_sub_cc: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); @@ -28315,6 +28317,32 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), {Chain, Op1, Op2}, VT, MMO); } + case Intrinsic::x86_atomic_add_cc: + case Intrinsic::x86_atomic_sub_cc: { + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + SDValue Op1 = Op.getOperand(2); + SDValue Op2 = 
Op.getOperand(3); + X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4); + MVT VT = Op2.getSimpleValueType(); + unsigned Opc = 0; + switch (IntNo) { + default: + llvm_unreachable("Unknown Intrinsic"); + case Intrinsic::x86_atomic_add_cc: + Opc = X86ISD::LADD; + break; + case Intrinsic::x86_atomic_sub_cc: + Opc = X86ISD::LSUB; + break; + } + MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand(); + SDValue LockArith = + DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other), + {Chain, Op1, Op2}, VT, MMO); + Chain = LockArith.getValue(1); + return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL); + } } return SDValue(); } @@ -31297,6 +31325,71 @@ AI->eraseFromParent(); } +static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) { + using namespace llvm::PatternMatch; + if (!AI->hasOneUse()) + return false; + + Value *Op = AI->getOperand(1); + ICmpInst::Predicate Pred; + Instruction *I = AI->user_back(); + AtomicRMWInst::BinOp Opc = AI->getOperation(); + if (Opc == AtomicRMWInst::Add) { + if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value()))) + return Pred == CmpInst::ICMP_EQ; + if (match(I, m_c_Add(m_Specific(Op), m_Value())) && I->hasOneUse() && + match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) + return Pred == CmpInst::ICMP_SLT; + return false; + } + if (Opc == AtomicRMWInst::Sub) { + if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value()))) + return Pred == CmpInst::ICMP_EQ; + if (match(I, m_Sub(m_Value(), m_Specific(Op))) && I->hasOneUse() && + match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) + return Pred == CmpInst::ICMP_SLT; + return false; + } + + return false; +} + +void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic( + AtomicRMWInst *AI) const { + IRBuilder<> Builder(AI); + Instruction *TempI = nullptr; + LLVMContext &Ctx = AI->getContext(); + ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back()); + if (!ICI) { + TempI = AI->user_back(); + ICI = cast<ICmpInst>(TempI->user_back()); + } + 
X86::CondCode CC = + ICI->getPredicate() == CmpInst::ICMP_EQ ? X86::COND_E : X86::COND_S; + Intrinsic::ID IID = Intrinsic::not_intrinsic; + switch (AI->getOperation()) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Add: + IID = Intrinsic::x86_atomic_add_cc; + break; + case AtomicRMWInst::Sub: + IID = Intrinsic::x86_atomic_sub_cc; + break; + } + Function *CmpArith = + Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); + Value *Call = Builder.CreateCall(CmpArith, {AI->getPointerOperand(), + AI->getValOperand(), + Builder.getInt32((unsigned)CC)}); + Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx)); + ICI->replaceAllUsesWith(Result); + ICI->eraseFromParent(); + if (TempI) + TempI->eraseFromParent(); + AI->eraseFromParent(); +} + TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; @@ -31314,9 +31407,12 @@ default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Xchg: + return AtomicExpansionKind::None; case AtomicRMWInst::Add: case AtomicRMWInst::Sub: - // It's better to use xadd, xsub or xchg for these in all cases. + if (shouldExpandCmpArithRMWInIR(AI)) + return AtomicExpansionKind::CmpArithIntrinsic; + // It's better to use xadd, xsub or xchg for these in other cases. 
return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: diff --git a/llvm/test/CodeGen/X86/pr37025.ll b/llvm/test/CodeGen/X86/pr37025.ll --- a/llvm/test/CodeGen/X86/pr37025.ll +++ b/llvm/test/CodeGen/X86/pr37025.ll @@ -43,13 +43,13 @@ define void @test_dec_select_commute(ptr nocapture %0, ptr readnone %1) { ; CHECK-LABEL: test_dec_select_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: movq $-1, %rax -; CHECK-NEXT: lock xaddq %rax, (%rdi) +; CHECK-NEXT: lock decq (%rdi) +; CHECK-NEXT: sete %al ; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: je .LBB1_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: cmpq $1, %rax -; CHECK-NEXT: jne .LBB1_2 +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB1_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: jmp func2 # TAILCALL ; CHECK-NEXT: .LBB1_2: @@ -71,13 +71,13 @@ define void @test_dec_and(ptr nocapture %0, ptr readnone %1) { ; CHECK-LABEL: test_dec_and: ; CHECK: # %bb.0: -; CHECK-NEXT: movq $-1, %rax -; CHECK-NEXT: lock xaddq %rax, (%rdi) +; CHECK-NEXT: lock decq (%rdi) +; CHECK-NEXT: sete %al ; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: je .LBB2_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: cmpq $1, %rax -; CHECK-NEXT: jne .LBB2_2 +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB2_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: jmp func2 # TAILCALL ; CHECK-NEXT: .LBB2_2: diff --git a/llvm/test/CodeGen/X86/pr58685.ll b/llvm/test/CodeGen/X86/pr58685.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr58685.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define dso_local noundef zeroext i1 @_Z13lock_add_seteRSt6atomicIjEj(ptr nocapture noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) local_unnamed_addr #0 { +; CHECK-LABEL: _Z13lock_add_seteRSt6atomicIjEj: +; CHECK: # %bb.0: +; CHECK-NEXT: lock addl %esi, (%rdi) +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq + %3 = atomicrmw add ptr %0, i32 %1 
seq_cst, align 4 + %4 = sub i32 0, %1 + %5 = icmp eq i32 %3, %4 + ret i1 %5 +} + +define dso_local noundef zeroext i1 @_Z13lock_add_setsRSt6atomicIjEj(ptr nocapture noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) local_unnamed_addr #0 { +; CHECK-LABEL: _Z13lock_add_setsRSt6atomicIjEj: +; CHECK: # %bb.0: +; CHECK-NEXT: lock addl %esi, (%rdi) +; CHECK-NEXT: sets %al +; CHECK-NEXT: retq + %3 = atomicrmw add ptr %0, i32 %1 seq_cst, align 4 + %4 = add i32 %3, %1 + %5 = icmp slt i32 %4, 0 + ret i1 %5 +} + +define dso_local noundef zeroext i1 @_Z13lock_sub_seteRSt6atomicIjEj(ptr nocapture noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) local_unnamed_addr #0 { +; CHECK-LABEL: _Z13lock_sub_seteRSt6atomicIjEj: +; CHECK: # %bb.0: +; CHECK-NEXT: lock subl %esi, (%rdi) +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq + %3 = atomicrmw sub ptr %0, i32 %1 seq_cst, align 4 + %4 = icmp eq i32 %3, %1 + ret i1 %4 +} + +define dso_local noundef zeroext i1 @_Z13lock_sub_setsRSt6atomicIjEj(ptr nocapture noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) local_unnamed_addr #0 { +; CHECK-LABEL: _Z13lock_sub_setsRSt6atomicIjEj: +; CHECK: # %bb.0: +; CHECK-NEXT: lock subl %esi, (%rdi) +; CHECK-NEXT: sets %al +; CHECK-NEXT: retq + %3 = atomicrmw sub ptr %0, i32 %1 seq_cst, align 4 + %4 = sub i32 %3, %1 + %5 = icmp slt i32 %4, 0 + ret i1 %5 +} + +attributes #0 = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }