diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -70,6 +70,14 @@ [ImmArg>]>; def int_x86_atomic_btr : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty], [ImmArg>]>; + def int_x86_atomic_bts_rm : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty], + []>; + def int_x86_atomic_btc_rm : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty], + []>; + def int_x86_atomic_btr_rm : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty], + []>; + + } // Lock binary arith with CC. diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -798,6 +798,9 @@ LBTS, LBTC, LBTR, + LBTS_RM, + LBTC_RM, + LBTR_RM, /// RAO arithmetic instructions. /// OUTCHAIN = AADD(INCHAIN, PTR, RHS) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5654,6 +5654,18 @@ MachineMemOperand::MOVolatile; return true; } + case Intrinsic::x86_atomic_bts_rm: + case Intrinsic::x86_atomic_btc_rm: + case Intrinsic::x86_atomic_btr_rm: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = I.getArgOperand(0); + unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); + Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); + Info.align = Align(Size); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::x86_aadd32: case Intrinsic::x86_aadd64: case Intrinsic::x86_aand32: @@ -28364,6 +28376,25 @@ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_atomic_bts_rm: + case Intrinsic::x86_atomic_btc_rm: + case Intrinsic::x86_atomic_btr_rm: { + SDLoc DL(Op); 
+ MVT VT = Op.getSimpleValueType(); + SDValue Chain = Op.getOperand(0); + SDValue Op1 = Op.getOperand(2); + SDValue Op2 = Op.getOperand(3); + unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM + : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM + : X86ISD::LBTR_RM; + MachineMemOperand *MMO = cast(Op)->getMemOperand(); + SDValue Res = + DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other), + {Chain, Op1, Op2}, VT, MMO); + Chain = Res.getValue(1); + Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain); + } case Intrinsic::x86_atomic_bts: case Intrinsic::x86_atomic_btc: case Intrinsic::x86_atomic_btr: { @@ -31401,6 +31432,85 @@ : AtomicExpansionKind::None; } +// How a value selects a single bit: a power-of-2 constant (ConstantBit), the +// complement of one (NotConstantBit), `1 << X` (ShiftBit) or `~(1 << X)` +// (NotShiftBit). UndefBit means no such pattern was recognized. +enum BitTestKind : unsigned { + UndefBit, + ConstantBit, + NotConstantBit, + ShiftBit, + NotShiftBit +}; + +// Classify V as a single-bit pattern. A ConstantInt yields {V, kind}, where +// kind stays UndefBit if V is not such a mask. A possibly inverted `1 << X` +// yields {X, kind}, with a full-width shift mask on X looked through. +// Anything else yields {nullptr, UndefBit}. +static std::pair FindSingleBitChange(Value *V) { + using namespace llvm::PatternMatch; + BitTestKind BTK = UndefBit; + auto *C = dyn_cast(V); + if (C) { + // Check if V is a power of 2 or NOT power of 2. + if (isPowerOf2_64(C->getZExtValue())) + BTK = ConstantBit; + else if (isPowerOf2_64((~C->getValue()).getZExtValue())) + BTK = NotConstantBit; + return {V, BTK}; + } + + // Check if V is some power of 2 pattern known to be non-zero + auto *I = dyn_cast(V); + if (I) { + bool Not = false; + // Check if we have a NOT + Value *PeekI; + if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) || + match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) { + Not = true; + I = dyn_cast(PeekI); + // PeekI may be a non-instruction such as a function argument, which + // cannot be analyzed: report no match instead of asserting. + if (!I) + return {nullptr, UndefBit}; + } + // We can only use 1 << X without more sophisticated analysis. C << X where + // C is a power of 2 but not 1 can result in zero which cannot be translated + // to bittest. Likewise any C >> X (either arith or logical) can be zero.
+ if (I->getOpcode() == Instruction::Shl) { + // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X & + // -X` and some other provable power of 2 patterns that we can use CTZ on + // may be profitable. + // Todo(2): It may be possible in some cases to prove that Shl(C, X) is + // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also + // be provably a non-zero power of 2. + // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be + // transformable to bittest. + auto *ShiftVal = dyn_cast(I->getOperand(0)); + if (!ShiftVal) + return {nullptr, UndefBit}; + if (ShiftVal->equalsInt(1)) + BTK = Not ? NotShiftBit : ShiftBit; + + if (BTK == UndefBit) + return {nullptr, UndefBit}; + + Value *BitV = I->getOperand(1); + + Value *AndOp; + const APInt *AndC; + if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) { + // Read past a shiftmask instruction to find count + if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1)) + BitV = AndOp; + } + return {BitV, BTK}; + } + } + return {nullptr, UndefBit}; +} + TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const { // If the atomicrmw's result isn't actually used, we can just add a "lock" @@ -31410,51 +31510,138 @@ // If the atomicrmw's result is used by a single bit AND, we may use // bts/btr/btc instruction for these operations. - auto *C1 = dyn_cast(AI->getValOperand()); + // Note: InstCombinePass can cause a de-optimization here. It replaces the + // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor + // (depending on CC). This pattern can only use bts/btr/btc but we don't + // detect it. 
Instruction *I = AI->user_back(); - if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And || + auto BitChange = FindSingleBitChange(AI->getValOperand()); + if (BitChange.second == UndefBit || !AI->hasOneUse() || + I->getOpcode() != Instruction::And || I->getOperand(0) != AI || + AI->getType()->getPrimitiveSizeInBits() == 8 || AI->getParent() != I->getParent()) return AtomicExpansionKind::CmpXChg; + + // AND is commutative: the atomicrmw-as-operand-1 form was rejected above. // The following instruction must be a AND single bit. - auto *C2 = dyn_cast(I->getOperand(1)); - unsigned Bits = AI->getType()->getPrimitiveSizeInBits(); - if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue())) + if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) { + auto *C1 = dyn_cast(AI->getValOperand()); + assert(C1 != nullptr); + auto *C2 = dyn_cast(I->getOperand(1)); + if (!C2 || !isPowerOf2_64(C2->getZExtValue())) { + return AtomicExpansionKind::CmpXChg; + } + if (AI->getOperation() == AtomicRMWInst::And) { + return ~C1->getValue() == C2->getValue() + ? AtomicExpansionKind::BitTestIntrinsic + : AtomicExpansionKind::CmpXChg; + } + return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic + : AtomicExpansionKind::CmpXChg; + } + + assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit); + + auto BitTested = FindSingleBitChange(I->getOperand(1)); + if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit) + return AtomicExpansionKind::CmpXChg; + + assert(BitChange.first != nullptr && BitTested.first != nullptr); + + // If shift amounts are not the same we can't use BitTestIntrinsic. + if (BitChange.first != BitTested.first) return AtomicExpansionKind::CmpXChg; + // An atomic AND must mask all but one bit, and the bit tested must be the + // one left unset in the mask. if (AI->getOperation() == AtomicRMWInst::And) - return ~C1->getValue() == C2->getValue() + return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit) ?
AtomicExpansionKind::BitTestIntrinsic : AtomicExpansionKind::CmpXChg; - return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic - : AtomicExpansionKind::CmpXChg; + // If atomic XOR/OR need to be setting and testing the same bit. + return (BitChange.second == ShiftBit && BitTested.second == ShiftBit) + ? AtomicExpansionKind::BitTestIntrinsic + : AtomicExpansionKind::CmpXChg; } void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { IRBuilder<> Builder(AI); - Intrinsic::ID IID = Intrinsic::not_intrinsic; + Intrinsic::ID IID_C = Intrinsic::not_intrinsic; + Intrinsic::ID IID_I = Intrinsic::not_intrinsic; switch (AI->getOperation()) { default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Or: - IID = Intrinsic::x86_atomic_bts; + IID_C = Intrinsic::x86_atomic_bts; + IID_I = Intrinsic::x86_atomic_bts_rm; break; case AtomicRMWInst::Xor: - IID = Intrinsic::x86_atomic_btc; + IID_C = Intrinsic::x86_atomic_btc; + IID_I = Intrinsic::x86_atomic_btc_rm; break; case AtomicRMWInst::And: - IID = Intrinsic::x86_atomic_btr; + IID_C = Intrinsic::x86_atomic_btr; + IID_I = Intrinsic::x86_atomic_btr_rm; break; } Instruction *I = AI->user_back(); LLVMContext &Ctx = AI->getContext(); - unsigned Imm = - countTrailingZeros(cast(I->getOperand(1))->getZExtValue()); - Function *BitTest = - Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), Type::getInt8PtrTy(Ctx)); - Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); + Function *BitTest = nullptr; + Value *Result = nullptr; + auto BitTested = FindSingleBitChange(AI->getValOperand()); + assert(BitTested.first != nullptr); + if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) { + auto *C = dyn_cast(I->getOperand(1)); + assert(C != nullptr); + + BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType()); + + unsigned Imm = countTrailingZeros(C->getZExtValue()); + 
Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); + } else { + BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType()); + + assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit); + + Value *SI = BitTested.first; + assert(SI != nullptr); + + // BT{S|R|C} on memory operand don't modulo bit position so we need to + // mask it. + unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits(); + Value *BitPos = + Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1)); + // Todo(1): In many cases it may be provable that SI is less than + // ShiftBits in which case this mask is unnecessary + // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1 + // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in + // favor of just a raw BT{S|R|C}. + + Result = Builder.CreateCall(BitTest, {Addr, BitPos}); + Result = Builder.CreateZExtOrTrunc(Result, AI->getType()); + + // If the result is only used for zero/non-zero status then we don't need to + // shift value back. Otherwise do so. + for (auto It = I->user_begin(); It != I->user_end(); ++It) { + if (auto *ICmp = dyn_cast(*It)) { + if (ICmp->isEquality()) { + auto *C0 = dyn_cast(ICmp->getOperand(0)); + auto *C1 = dyn_cast(ICmp->getOperand(1)); + if (C0 || C1) { + assert(C0 == nullptr || C1 == nullptr); + if ((C0 ? 
C0 : C1)->isZero()) + continue; + } + } + } + Result = Builder.CreateShl(Result, BitPos); + break; + } + } + I->replaceAllUsesWith(Result); I->eraseFromParent(); AI->eraseFromParent(); @@ -34242,6 +34429,9 @@ NODE_NAME_CASE(LBTS) NODE_NAME_CASE(LBTC) NODE_NAME_CASE(LBTR) + NODE_NAME_CASE(LBTS_RM) + NODE_NAME_CASE(LBTC_RM) + NODE_NAME_CASE(LBTR_RM) NODE_NAME_CASE(AADD) NODE_NAME_CASE(AOR) NODE_NAME_CASE(AXOR) diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -864,6 +864,17 @@ def x86btr : SDNode<"X86ISD::LBTR", X86LBTest, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def X86LBTestRM : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, + SDTCisInt<2>]>; + +def x86_rm_bts : SDNode<"X86ISD::LBTS_RM", X86LBTestRM, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def x86_rm_btc : SDNode<"X86ISD::LBTC_RM", X86LBTestRM, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def x86_rm_btr : SDNode<"X86ISD::LBTR_RM", X86LBTestRM, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; + + multiclass ATOMIC_LOGIC_OP { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, SchedRW = [WriteBitTestSetRegRMW] in { @@ -882,10 +893,33 @@ } } +multiclass ATOMIC_LOGIC_OP_RM Opc8, string s> { + let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteBitTestSetRegRMW] in { + def 16rm : Ii8("x86_rm_" # s) addr:$src1, GR16:$src2))]>, + OpSize16, TB, LOCK; + def 32rm : Ii8("x86_rm_" # s) addr:$src1, GR32:$src2))]>, + OpSize32, TB, LOCK; + def 64rm : RIi8("x86_rm_" # s) addr:$src1, GR64:$src2))]>, + TB, LOCK; + } +} + + defm LOCK_BTS : ATOMIC_LOGIC_OP; defm LOCK_BTC : ATOMIC_LOGIC_OP; defm LOCK_BTR : ATOMIC_LOGIC_OP; +defm LOCK_BTS_RM : ATOMIC_LOGIC_OP_RM<0xAB, "bts">; +defm LOCK_BTC_RM : ATOMIC_LOGIC_OP_RM<0xBB, "btc">; +defm LOCK_BTR_RM : ATOMIC_LOGIC_OP_RM<0xB3, 
"btr">; + // Atomic compare and swap. multiclass LCMPXCHG_BinOp Opc8, bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag> { diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll @@ -5,19 +5,12 @@ ; CHECK-LABEL: atomic_shl1_xor_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btcq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: xorq %rdx, %rcx -; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) -; CHECK-NEXT: jne .LBB0_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %shl = shl nuw i64 1, %c @@ -85,20 +78,12 @@ ; CHECK-LABEL: atomic_shl1_small_mask_xor_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: andb $31, %cl -; CHECK-NEXT: movl $1, %edx +; CHECK-NEXT: andl $31, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btcq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB3_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: xorq %rdx, %rcx -; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) -; CHECK-NEXT: jne .LBB3_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %rem = and i64 %c, 31 @@ -112,21 
+97,12 @@ ; CHECK-LABEL: atomic_shl1_mask0_xor_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB4_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: xorq %rdx, %rsi -; CHECK-NEXT: lock cmpxchgq %rsi, (%rdi) -; CHECK-NEXT: jne .LBB4_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: movl $1, %edx +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btcq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %rem = and i64 %c, 63 @@ -141,21 +117,12 @@ ; CHECK-LABEL: atomic_shl1_mask1_xor_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB5_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: xorq %rdx, %rsi -; CHECK-NEXT: lock cmpxchgq %rsi, (%rdi) -; CHECK-NEXT: jne .LBB5_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: movl $1, %edx +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btcq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %shl = shl nuw i64 1, %c @@ -170,19 +137,12 @@ ; CHECK-LABEL: atomic_shl1_mask01_xor_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btcq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; 
CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB6_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: xorq %rdx, %rcx -; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) -; CHECK-NEXT: jne .LBB6_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %rem = and i64 %c, 63 @@ -701,21 +661,12 @@ ; CHECK-LABEL: atomic_shl1_and_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq $-2, %rsi +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btrq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: rolq %cl, %rsi -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB24_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) -; CHECK-NEXT: jne .LBB24_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %shl = shl nuw i64 1, %c @@ -788,22 +739,12 @@ ; CHECK-LABEL: atomic_shl1_small_mask_and_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: andb $31, %cl -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq $-2, %rsi +; CHECK-NEXT: andl $31, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btrq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: rolq %cl, %rsi -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB27_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 
-; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) -; CHECK-NEXT: jne .LBB27_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %rem = and i64 %c, 31 @@ -818,21 +759,12 @@ ; CHECK-LABEL: atomic_shl1_mask0_and_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movq $-2, %rdx -; CHECK-NEXT: rolq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB28_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: andq %rdx, %rsi -; CHECK-NEXT: lock cmpxchgq %rsi, (%rdi) -; CHECK-NEXT: jne .LBB28_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: movl $1, %edx +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btrq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %rem = and i64 %c, 63 @@ -848,21 +780,12 @@ ; CHECK-LABEL: atomic_shl1_mask1_and_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movq $-2, %rdx -; CHECK-NEXT: rolq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB29_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: andq %rdx, %rsi -; CHECK-NEXT: lock cmpxchgq %rsi, (%rdi) -; CHECK-NEXT: jne .LBB29_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: movl $1, %edx +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btrq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %shl = 
shl nuw i64 1, %c @@ -878,21 +801,12 @@ ; CHECK-LABEL: atomic_shl1_mask01_and_64_gpr_val: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq $-2, %rsi +; CHECK-NEXT: andl $63, %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: lock btrq %rcx, (%rdi) +; CHECK-NEXT: setb %al ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NEXT: rolq %cl, %rsi -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB30_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) -; CHECK-NEXT: jne .LBB30_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: shlq %cl, %rax ; CHECK-NEXT: retq entry: %rem = and i64 %c, 63 @@ -1185,26 +1099,14 @@ define i64 @atomic_shl1_and_64_gpr_brnz(ptr %v, i64 %c) nounwind { ; CHECK-LABEL: atomic_shl1_and_64_gpr_brnz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq $-2, %rsi -; CHECK-NEXT: rolq %cl, %rsi -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB40_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: andq %rsi, %r8 -; CHECK-NEXT: lock cmpxchgq %r8, (%rdi) -; CHECK-NEXT: jne .LBB40_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: testq %rdx, %rax -; CHECK-NEXT: je .LBB40_3 -; CHECK-NEXT: # %bb.4: # %if.then -; CHECK-NEXT: movq (%rdi,%rcx,8), %rax +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: lock btrq %rax, (%rdi) +; CHECK-NEXT: jae .LBB40_1 +; CHECK-NEXT: # %bb.2: # %if.then +; CHECK-NEXT: movq (%rdi,%rsi,8), %rax ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB40_3: +; CHECK-NEXT: .LBB40_1: ; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: retq entry: @@ -1316,27 
+1218,13 @@ define i64 @atomic_shl1_small_mask_and_64_gpr_brnz(ptr %v, i64 %c) nounwind { ; CHECK-LABEL: atomic_shl1_small_mask_and_64_gpr_brnz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: andl $31, %ecx -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq $-2, %rsi -; CHECK-NEXT: rolq %cl, %rsi -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB43_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: andq %rsi, %r8 -; CHECK-NEXT: lock cmpxchgq %r8, (%rdi) -; CHECK-NEXT: jne .LBB43_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: testl %edx, %eax -; CHECK-NEXT: je .LBB43_3 -; CHECK-NEXT: # %bb.4: # %if.then -; CHECK-NEXT: movq (%rdi,%rcx,8), %rax +; CHECK-NEXT: andl $31, %esi +; CHECK-NEXT: lock btrq %rsi, (%rdi) +; CHECK-NEXT: jae .LBB43_1 +; CHECK-NEXT: # %bb.2: # %if.then +; CHECK-NEXT: movq (%rdi,%rsi,8), %rax ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB43_3: +; CHECK-NEXT: .LBB43_1: ; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: retq entry: @@ -1361,24 +1249,14 @@ define i64 @atomic_shl1_mask0_and_64_gpr_brnz(ptr %v, i64 %c) nounwind { ; CHECK-LABEL: atomic_shl1_mask0_and_64_gpr_brnz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movq $-2, %rdx -; CHECK-NEXT: rolq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB44_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: andq %rdx, %rsi -; CHECK-NEXT: lock cmpxchgq %rsi, (%rdi) -; CHECK-NEXT: jne .LBB44_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: btq %rcx, %rax -; CHECK-NEXT: jae .LBB44_3 -; CHECK-NEXT: # %bb.4: # %if.then -; CHECK-NEXT: movq (%rdi,%rcx,8), %rax +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: lock btrq %rax, (%rdi) +; CHECK-NEXT: jae .LBB44_1 +; CHECK-NEXT: # %bb.2: # 
%if.then +; CHECK-NEXT: movq (%rdi,%rsi,8), %rax ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB44_3: +; CHECK-NEXT: .LBB44_1: ; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: retq entry: @@ -1404,24 +1282,14 @@ define i64 @atomic_shl1_mask1_and_64_gpr_brnz(ptr %v, i64 %c) nounwind { ; CHECK-LABEL: atomic_shl1_mask1_and_64_gpr_brnz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movq $-2, %rdx -; CHECK-NEXT: rolq %cl, %rdx -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB45_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: andq %rdx, %rsi -; CHECK-NEXT: lock cmpxchgq %rsi, (%rdi) -; CHECK-NEXT: jne .LBB45_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: btq %rcx, %rax -; CHECK-NEXT: jae .LBB45_3 -; CHECK-NEXT: # %bb.4: # %if.then -; CHECK-NEXT: movq (%rdi,%rcx,8), %rax +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: lock btrq %rax, (%rdi) +; CHECK-NEXT: jae .LBB45_1 +; CHECK-NEXT: # %bb.2: # %if.then +; CHECK-NEXT: movq (%rdi,%rsi,8), %rax ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB45_3: +; CHECK-NEXT: .LBB45_1: ; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: retq entry: @@ -1447,26 +1315,14 @@ define i64 @atomic_shl1_mask01_and_64_gpr_brnz(ptr %v, i64 %c) nounwind { ; CHECK-LABEL: atomic_shl1_mask01_and_64_gpr_brnz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: movl $1, %edx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: movq $-2, %rsi -; CHECK-NEXT: rolq %cl, %rsi -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB46_1: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: andq %rsi, %r8 -; CHECK-NEXT: lock cmpxchgq %r8, (%rdi) -; CHECK-NEXT: jne .LBB46_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: testq %rdx, %rax -; CHECK-NEXT: je .LBB46_3 -; CHECK-NEXT: # %bb.4: # %if.then -; CHECK-NEXT: 
movq (%rdi,%rcx,8), %rax +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: lock btrq %rax, (%rdi) +; CHECK-NEXT: jae .LBB46_1 +; CHECK-NEXT: # %bb.2: # %if.then +; CHECK-NEXT: movq (%rdi,%rsi,8), %rax ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB46_3: +; CHECK-NEXT: .LBB46_1: ; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll @@ -751,49 +751,26 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind { ; X86-LABEL: atomic_shl1_small_mask_xor_16_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andb $7, %cl -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl (%edx), %eax -; X86-NEXT: movzwl %si, %ecx -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB13_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %si, (%edx) -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $7, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btcw %cx, (%edx) +; X86-NEXT: setb %al +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_small_mask_xor_16_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andb $7, %cl -; X64-NEXT: movl $1, %edx +; X64-NEXT: andl $7, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btcw %cx, (%rdi) +; 
X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl %dx, %ecx -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB13_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %edx -; X64-NEXT: xorl %ecx, %edx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %dx, (%rdi) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB13_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl %ecx, %eax +; X64-NEXT: shll %cl, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -936,47 +913,26 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind { ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andb $15, %cl -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzwl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB16_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%esi) -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB16_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btcw %cx, (%edx) +; X86-NEXT: setb %al +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andb $15, %cl -; 
X64-NEXT: movl $1, %edx +; X64-NEXT: andl $15, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btcw %cx, (%rdi) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB16_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl %edx, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, (%rdi) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl %edx, %eax +; X64-NEXT: shll %cl, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -2384,56 +2340,27 @@ define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind { ; X86-LABEL: atomic_shl1_small_mask_and_16_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andb $7, %cl -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movw $-2, %di -; X86-NEXT: rolw %cl, %di -; X86-NEXT: movzwl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB37_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl %edi, %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $7, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btrw %cx, (%edx) +; X86-NEXT: setb %al +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%edx) -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB37_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movzwl %si, %ecx -; X86-NEXT: andl %eax, %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: popl %esi -; 
X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_small_mask_and_16_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andb $7, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movw $-2, %si +; X64-NEXT: andl $7, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btrw %cx, (%rdi) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: rolw %cl, %si -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB37_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl %esi, %ecx +; X64-NEXT: shll %cl, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, (%rdi) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB37_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movzwl %dx, %ecx -; X64-NEXT: andl %eax, %ecx -; X64-NEXT: movl %ecx, %eax ; X64-NEXT: retq entry: %0 = and i16 %c, 7 @@ -2575,55 +2502,26 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind { ; X86-LABEL: atomic_shl1_mask01_and_16_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andb $15, %cl -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movw $-2, %di -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: rolw %cl, %di -; X86-NEXT: movzwl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB40_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%esi) -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB40_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btrw %cx, (%edx) +; X86-NEXT: setb %al +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movw $-2, %r8w -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: rolw %cl, %r8w -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB40_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl %r8d, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, (%rdi) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB40_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl %edx, %eax +; X64-NEXT: andl $15, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btrw %cx, (%rdi) +; X64-NEXT: setb %al +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -3855,40 +3753,25 @@ define i32 @atomic_shl1_or_32_gpr_val(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_or_32_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB60_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: lock cmpxchgl %ecx, (%esi) -; X86-NEXT: jne .LBB60_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; 
X86-NEXT: andl %edx, %eax -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $31, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsl %ecx, (%edx) +; X86-NEXT: setb %al +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_or_32_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx +; X64-NEXT: andl $31, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsl %ecx, (%rdi) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB60_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl %edx, %ecx -; X64-NEXT: lock cmpxchgl %ecx, (%rdi) -; X64-NEXT: jne .LBB60_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl %edx, %eax +; X64-NEXT: shll %cl, %eax ; X64-NEXT: retq entry: %shl = shl nuw i32 1, %c @@ -3900,42 +3783,25 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_val(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andb $15, %cl -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB61_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: lock cmpxchgl %ecx, (%edx) -; X86-NEXT: jne .LBB61_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %esi, %eax -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsl %ecx, (%edx) +; X86-NEXT: setb %al +; X86-NEXT: # kill: def $cl 
killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx +; X64-NEXT: andl $15, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsl %ecx, (%rdi) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB61_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl %edx, %ecx -; X64-NEXT: lock cmpxchgl %ecx, (%rdi) -; X64-NEXT: jne .LBB61_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl %edx, %eax +; X64-NEXT: shll %cl, %eax ; X64-NEXT: retq entry: %0 = and i32 %c, 15 @@ -3948,47 +3814,25 @@ define i32 @atomic_shl1_mask0_or_32_gpr_val(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB62_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB62_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $31, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsl %ecx, (%edx) +; X86-NEXT: setb %al ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl %edx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: 
movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB62_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB62_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movl $1, %edx +; X64-NEXT: andl $31, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsl %ecx, (%rdi) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: andl %edx, %eax +; X64-NEXT: shll %cl, %eax ; X64-NEXT: retq entry: %0 = and i32 %c, 31 @@ -4002,47 +3846,25 @@ define i32 @atomic_shl1_mask1_or_32_gpr_val(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB63_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB63_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $31, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsl %ecx, (%edx) +; X86-NEXT: setb %al ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl %edx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB63_1: # %atomicrmw.start -; X64-NEXT: # 
=>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB63_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movl $1, %edx +; X64-NEXT: andl $31, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsl %ecx, (%rdi) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: andl %edx, %eax +; X64-NEXT: shll %cl, %eax ; X64-NEXT: retq entry: %shl = shl nuw i32 1, %c @@ -4056,40 +3878,25 @@ define i32 @atomic_shl1_mask01_or_32_gpr_val(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_val: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB64_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: lock cmpxchgl %ecx, (%esi) -; X86-NEXT: jne .LBB64_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl %edx, %eax -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $31, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsl %ecx, (%edx) +; X86-NEXT: setb %al +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_val: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx +; X64-NEXT: andl $31, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsl %ecx, (%rdi) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB64_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; 
X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl %edx, %ecx -; X64-NEXT: lock cmpxchgl %ecx, (%rdi) -; X64-NEXT: jne .LBB64_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl %edx, %eax +; X64-NEXT: shll %cl, %eax ; X64-NEXT: retq entry: %0 = and i32 %c, 31 @@ -4807,54 +4614,30 @@ define i32 @atomic_shl1_or_32_gpr_br(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_or_32_gpr_br: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB78_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB78_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: testl %esi, %eax -; X86-NEXT: je .LBB78_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB78_5 -; X86-NEXT: .LBB78_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB78_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB78_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB78_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_or_32_gpr_br: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB78_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB78_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: testl %edx, %eax -; X64-NEXT: je .LBB78_3 
-; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB78_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB78_3: +; X64-NEXT: .LBB78_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -4878,56 +4661,28 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_br(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_br: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andl $15, %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB79_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB79_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: testl %esi, %eax -; X86-NEXT: je .LBB79_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB79_5 -; X86-NEXT: .LBB79_3: +; X86-NEXT: lock btsl %ecx, (%eax) +; X86-NEXT: jae .LBB79_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%eax,%ecx,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB79_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB79_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_br: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $15, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB79_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl 
%edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB79_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: testl %edx, %eax -; X64-NEXT: je .LBB79_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: andl $15, %esi +; X64-NEXT: lock btsl %esi, (%rdi) +; X64-NEXT: jae .LBB79_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB79_3: +; X64-NEXT: .LBB79_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -4952,54 +4707,30 @@ define i32 @atomic_shl1_mask0_or_32_gpr_br(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_br: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB80_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB80_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae .LBB80_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB80_5 -; X86-NEXT: .LBB80_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB80_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB80_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB80_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_br: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: 
.LBB80_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB80_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: btl %ecx, %eax -; X64-NEXT: jae .LBB80_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB80_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB80_3: +; X64-NEXT: .LBB80_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -5025,54 +4756,30 @@ define i32 @atomic_shl1_mask1_or_32_gpr_br(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_br: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB81_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB81_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae .LBB81_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB81_5 -; X86-NEXT: .LBB81_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB81_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB81_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB81_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_br: ; X64: # %bb.0: # %entry -; 
X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB81_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB81_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: btl %ecx, %eax -; X64-NEXT: jae .LBB81_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB81_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB81_3: +; X64-NEXT: .LBB81_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -5098,54 +4805,30 @@ define i32 @atomic_shl1_mask01_or_32_gpr_br(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_br: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB82_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB82_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: testl %esi, %eax -; X86-NEXT: je .LBB82_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB82_5 -; X86-NEXT: .LBB82_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB82_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB82_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: 
.LBB82_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_br: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB82_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB82_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: testl %edx, %eax -; X64-NEXT: je .LBB82_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB82_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB82_3: +; X64-NEXT: .LBB82_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -5243,56 +4926,31 @@ define i32 @atomic_shl1_or_32_gpr_brz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_or_32_gpr_brz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl $1, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB84_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: lock cmpxchgl %edx, (%esi) -; X86-NEXT: jne .LBB84_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movl $123, %edx -; X86-NEXT: testl %edi, %eax -; X86-NEXT: jne .LBB84_4 -; X86-NEXT: # %bb.3: # %if.then -; X86-NEXT: movl (%esi,%ecx,4), %edx -; X86-NEXT: .LBB84_4: # %return -; X86-NEXT: movl %edx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %eax +; 
X86-NEXT: andl $31, %eax +; X86-NEXT: lock btsl %eax, (%edx) +; X86-NEXT: movl $123, %eax +; X86-NEXT: jae .LBB84_1 +; X86-NEXT: # %bb.2: # %return +; X86-NEXT: retl +; X86-NEXT: .LBB84_1: # %if.then +; X86-NEXT: movl (%edx,%ecx,4), %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_or_32_gpr_brz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB84_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %edx -; X64-NEXT: orl %esi, %edx -; X64-NEXT: lock cmpxchgl %edx, (%rdi) -; X64-NEXT: jne .LBB84_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movl $123, %edx -; X64-NEXT: testl %esi, %eax -; X64-NEXT: je .LBB84_3 -; X64-NEXT: # %bb.4: # %return -; X64-NEXT: movl %edx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: movl $123, %eax +; X64-NEXT: jae .LBB84_1 +; X64-NEXT: # %bb.2: # %return ; X64-NEXT: retq -; X64-NEXT: .LBB84_3: # %if.then -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: movl (%rdi,%rax,4), %edx -; X64-NEXT: movl %edx, %eax +; X64-NEXT: .LBB84_1: # %if.then +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq entry: %shl = shl nuw i32 1, %c @@ -5315,58 +4973,29 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_brz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $15, %ecx -; X86-NEXT: movl $1, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB85_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: lock cmpxchgl %edx, (%esi) -; X86-NEXT: jne 
.LBB85_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movl $123, %edx -; X86-NEXT: testl %edi, %eax -; X86-NEXT: jne .LBB85_4 -; X86-NEXT: # %bb.3: # %if.then -; X86-NEXT: movl (%esi,%ecx,4), %edx -; X86-NEXT: .LBB85_4: # %return -; X86-NEXT: movl %edx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: movl $123, %eax +; X86-NEXT: jae .LBB85_1 +; X86-NEXT: # %bb.2: # %return +; X86-NEXT: retl +; X86-NEXT: .LBB85_1: # %if.then +; X86-NEXT: movl (%ecx,%edx,4), %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_brz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $15, %ecx -; X64-NEXT: movl $1, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB85_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %edx -; X64-NEXT: orl %esi, %edx -; X64-NEXT: lock cmpxchgl %edx, (%rdi) -; X64-NEXT: jne .LBB85_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movl $123, %edx -; X64-NEXT: testl %esi, %eax -; X64-NEXT: je .LBB85_3 -; X64-NEXT: # %bb.4: # %return -; X64-NEXT: movl %edx, %eax +; X64-NEXT: andl $15, %esi +; X64-NEXT: lock btsl %esi, (%rdi) +; X64-NEXT: movl $123, %eax +; X64-NEXT: jae .LBB85_1 +; X64-NEXT: # %bb.2: # %return ; X64-NEXT: retq -; X64-NEXT: .LBB85_3: # %if.then -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: movl (%rdi,%rax,4), %edx -; X64-NEXT: movl %edx, %eax +; X64-NEXT: .LBB85_1: # %if.then +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq entry: %0 = and i32 %c, 15 @@ -5390,56 +5019,31 @@ define i32 @atomic_shl1_mask0_or_32_gpr_brz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi 
-; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB86_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: lock cmpxchgl %edi, (%esi) -; X86-NEXT: jne .LBB86_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movl $123, %edx -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jb .LBB86_4 -; X86-NEXT: # %bb.3: # %if.then -; X86-NEXT: movl (%esi,%ecx,4), %edx -; X86-NEXT: .LBB86_4: # %return -; X86-NEXT: movl %edx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $31, %eax +; X86-NEXT: lock btsl %eax, (%edx) +; X86-NEXT: movl $123, %eax +; X86-NEXT: jae .LBB86_1 +; X86-NEXT: # %bb.2: # %return +; X86-NEXT: retl +; X86-NEXT: .LBB86_1: # %if.then +; X86-NEXT: movl (%edx,%ecx,4), %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_brz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB86_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB86_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movl $123, %edx -; X64-NEXT: btl %ecx, %eax -; X64-NEXT: jae .LBB86_3 -; X64-NEXT: # %bb.4: # %return -; X64-NEXT: movl %edx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: movl $123, %eax +; X64-NEXT: jae .LBB86_1 +; X64-NEXT: # %bb.2: # %return ; X64-NEXT: retq -; X64-NEXT: .LBB86_3: # %if.then -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: movl (%rdi,%rax,4), %edx -; X64-NEXT: movl %edx, %eax +; X64-NEXT: .LBB86_1: # %if.then +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl (%rdi,%rax,4), 
%eax ; X64-NEXT: retq entry: %rem = and i32 %c, 31 @@ -5464,56 +5068,31 @@ define i32 @atomic_shl1_mask1_or_32_gpr_brz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB87_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: lock cmpxchgl %edi, (%esi) -; X86-NEXT: jne .LBB87_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movl $123, %edx -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jb .LBB87_4 -; X86-NEXT: # %bb.3: # %if.then -; X86-NEXT: movl (%esi,%ecx,4), %edx -; X86-NEXT: .LBB87_4: # %return -; X86-NEXT: movl %edx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $31, %eax +; X86-NEXT: lock btsl %eax, (%edx) +; X86-NEXT: movl $123, %eax +; X86-NEXT: jae .LBB87_1 +; X86-NEXT: # %bb.2: # %return +; X86-NEXT: retl +; X86-NEXT: .LBB87_1: # %if.then +; X86-NEXT: movl (%edx,%ecx,4), %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_brz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB87_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB87_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movl $123, %edx -; X64-NEXT: btl %ecx, %eax -; X64-NEXT: jae .LBB87_3 -; X64-NEXT: # %bb.4: # %return -; X64-NEXT: movl %edx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl 
%eax, (%rdi) +; X64-NEXT: movl $123, %eax +; X64-NEXT: jae .LBB87_1 +; X64-NEXT: # %bb.2: # %return ; X64-NEXT: retq -; X64-NEXT: .LBB87_3: # %if.then -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: movl (%rdi,%rax,4), %edx -; X64-NEXT: movl %edx, %eax +; X64-NEXT: .LBB87_1: # %if.then +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq entry: %shl = shl nuw i32 1, %c @@ -5538,56 +5117,31 @@ define i32 @atomic_shl1_mask01_or_32_gpr_brz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl $1, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB88_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: lock cmpxchgl %edx, (%esi) -; X86-NEXT: jne .LBB88_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movl $123, %edx -; X86-NEXT: testl %edi, %eax -; X86-NEXT: jne .LBB88_4 -; X86-NEXT: # %bb.3: # %if.then -; X86-NEXT: movl (%esi,%ecx,4), %edx -; X86-NEXT: .LBB88_4: # %return -; X86-NEXT: movl %edx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $31, %eax +; X86-NEXT: lock btsl %eax, (%edx) +; X86-NEXT: movl $123, %eax +; X86-NEXT: jae .LBB88_1 +; X86-NEXT: # %bb.2: # %return +; X86-NEXT: retl +; X86-NEXT: .LBB88_1: # %if.then +; X86-NEXT: movl (%edx,%ecx,4), %eax ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_brz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB88_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %edx -; 
X64-NEXT: orl %esi, %edx -; X64-NEXT: lock cmpxchgl %edx, (%rdi) -; X64-NEXT: jne .LBB88_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: movl $123, %edx -; X64-NEXT: testl %esi, %eax -; X64-NEXT: je .LBB88_3 -; X64-NEXT: # %bb.4: # %return -; X64-NEXT: movl %edx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: movl $123, %eax +; X64-NEXT: jae .LBB88_1 +; X64-NEXT: # %bb.2: # %return ; X64-NEXT: retq -; X64-NEXT: .LBB88_3: # %if.then -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: movl (%rdi,%rax,4), %edx -; X64-NEXT: movl %edx, %eax +; X64-NEXT: .LBB88_1: # %if.then +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq entry: %rem = and i32 %c, 31 @@ -5685,54 +5239,30 @@ define i32 @atomic_shl1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_or_32_gpr_brnz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB90_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB90_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: testl %esi, %eax -; X86-NEXT: je .LBB90_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB90_5 -; X86-NEXT: .LBB90_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB90_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB90_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB90_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: 
atomic_shl1_or_32_gpr_brnz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB90_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB90_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: testl %edx, %eax -; X64-NEXT: je .LBB90_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB90_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB90_3: +; X64-NEXT: .LBB90_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -5756,56 +5286,28 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_brnz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brnz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andl $15, %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB91_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB91_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: testl %esi, %eax -; X86-NEXT: je .LBB91_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB91_5 -; X86-NEXT: .LBB91_3: +; X86-NEXT: lock btsl %ecx, (%eax) +; X86-NEXT: jae .LBB91_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%eax,%ecx,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB91_1: ; 
X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB91_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_brnz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $15, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB91_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB91_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: testl %edx, %eax -; X64-NEXT: je .LBB91_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: andl $15, %esi +; X64-NEXT: lock btsl %esi, (%rdi) +; X64-NEXT: jae .LBB91_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB91_3: +; X64-NEXT: .LBB91_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -5830,54 +5332,30 @@ define i32 @atomic_shl1_mask0_or_32_gpr_brnz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brnz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB92_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB92_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae .LBB92_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB92_5 -; X86-NEXT: .LBB92_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: 
lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB92_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB92_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB92_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_brnz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB92_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB92_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: btl %ecx, %eax -; X64-NEXT: jae .LBB92_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB92_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB92_3: +; X64-NEXT: .LBB92_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -5903,54 +5381,30 @@ define i32 @atomic_shl1_mask1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brnz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB93_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, (%edx) -; X86-NEXT: jne .LBB93_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae .LBB93_3 -; X86-NEXT: # %bb.4: # %if.then 
-; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB93_5 -; X86-NEXT: .LBB93_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB93_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB93_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB93_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_brnz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB93_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB93_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: btl %ecx, %eax -; X64-NEXT: jae .LBB93_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB93_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB93_3: +; X64-NEXT: .LBB93_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: @@ -5976,54 +5430,30 @@ define i32 @atomic_shl1_mask01_or_32_gpr_brnz(ptr %v, i32 %c) nounwind { ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brnz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB94_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: lock cmpxchgl %edi, 
(%edx) -; X86-NEXT: jne .LBB94_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: testl %esi, %eax -; X86-NEXT: je .LBB94_3 -; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movl (%edx,%ecx,4), %eax -; X86-NEXT: jmp .LBB94_5 -; X86-NEXT: .LBB94_3: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $31, %edx +; X86-NEXT: lock btsl %edx, (%ecx) +; X86-NEXT: jae .LBB94_1 +; X86-NEXT: # %bb.2: # %if.then +; X86-NEXT: movl (%ecx,%eax,4), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB94_1: ; X86-NEXT: movl $123, %eax -; X86-NEXT: .LBB94_5: # %return -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_brnz: ; X64: # %bb.0: # %entry -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB94_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %esi -; X64-NEXT: orl %edx, %esi -; X64-NEXT: lock cmpxchgl %esi, (%rdi) -; X64-NEXT: jne .LBB94_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: testl %edx, %eax -; X64-NEXT: je .LBB94_3 -; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $31, %eax +; X64-NEXT: lock btsl %eax, (%rdi) +; X64-NEXT: jae .LBB94_1 +; X64-NEXT: # %bb.2: # %if.then +; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq -; X64-NEXT: .LBB94_3: +; X64-NEXT: .LBB94_1: ; X64-NEXT: movl $123, %eax ; X64-NEXT: retq entry: