diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -70,6 +70,14 @@
                                      [ImmArg<ArgIndex<1>>]>;
   def int_x86_atomic_btr : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
                                      [ImmArg<ArgIndex<1>>]>;
+  // Bit index passed as a (non-immediate) integer operand; result is i8.
+  def int_x86_atomic_bts_rm  : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
+                                         []>;
+  def int_x86_atomic_btc_rm  : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
+                                         []>;
+  def int_x86_atomic_btr_rm  : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
+                                         []>;
+
 }
 
 // Lock binary arith with CC.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -799,6 +799,9 @@
     LBTS,
     LBTC,
     LBTR,
+    LBTS_RM,
+    LBTC_RM,
+    LBTR_RM,
 
     /// RAO arithmetic instructions.
     /// OUTCHAIN = AADD(INCHAIN, PTR, RHS)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5642,6 +5642,9 @@
       return true;
     case Intrinsic::x86_cmpccxadd32:
     case Intrinsic::x86_cmpccxadd64:
+    case Intrinsic::x86_atomic_bts_rm:
+    case Intrinsic::x86_atomic_btc_rm:
+    case Intrinsic::x86_atomic_btr_rm:
     case Intrinsic::x86_atomic_bts:
     case Intrinsic::x86_atomic_btc:
     case Intrinsic::x86_atomic_btr: {
@@ -5654,6 +5657,7 @@
                     MachineMemOperand::MOVolatile;
       return true;
     }
+
     case Intrinsic::x86_aadd32:
     case Intrinsic::x86_aadd64:
     case Intrinsic::x86_aand32:
@@ -28358,6 +28362,25 @@
       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                          Operation.getValue(1));
     }
+    case Intrinsic::x86_atomic_bts_rm:
+    case Intrinsic::x86_atomic_btc_rm:
+    case Intrinsic::x86_atomic_btr_rm: {
+      SDLoc DL(Op);
+      MVT VT = Op.getSimpleValueType();
+      SDValue Chain = Op.getOperand(0);
+      SDValue Op1 = Op.getOperand(2);
+      SDValue Op2 = Op.getOperand(3);
+      unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm   ? X86ISD::LBTS_RM
+                     : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
+                                                             : X86ISD::LBTR_RM;
+      MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
+      SDValue Res =
+          DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
+                                  {Chain, Op1, Op2}, VT, MMO);
+      Chain = Res.getValue(1);
+      Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
+      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
+    }
     case Intrinsic::x86_atomic_bts:
     case Intrinsic::x86_atomic_btc:
     case Intrinsic::x86_atomic_btr: {
@@ -28369,6 +28392,7 @@
       unsigned Opc = IntNo == Intrinsic::x86_atomic_bts   ? X86ISD::LBTS
                      : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
                                                           : X86ISD::LBTR;
+
       SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
       SDValue Res =
@@ -31395,6 +31419,96 @@
                                  : AtomicExpansionKind::None;
 }
 
+enum BitTestKind : unsigned {
+  UndefBit,       // No usable single-bit pattern was recognized.
+  ConstantBit,    // Constant that is a power of 2.
+  NotConstantBit, // Constant whose bitwise NOT is a power of 2.
+  ShiftBit,       // `1 << X`.
+  NotShiftBit     // Bitwise NOT of `1 << X`.
+};
+
+// Classify V as a single-bit mask.
+//  - ConstantInt: returns {V, ConstantBit/NotConstantBit/UndefBit}.
+//  - `1 << X` or its bitwise NOT: returns {X, ShiftBit/NotShiftBit}, where X
+//    is read from under an `and X, bitwidth - 1` mask if one is present.
+//  - Anything else: returns {nullptr, UndefBit}.
+static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
+  BitTestKind BTK = UndefBit;
+  if (auto *C = dyn_cast<ConstantInt>(V)) {
+    // Check if V is a power of 2 or NOT power of 2.
+    if (isPowerOf2_64(C->getZExtValue()))
+      BTK = ConstantBit;
+    else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
+      BTK = NotConstantBit;
+    return {V, BTK};
+  }
+
+  // Check if V is some power of 2 pattern known to be non-zero
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return {nullptr, UndefBit};
+
+  bool Not = false;
+  // Check if we have a NOT
+  if (I->getOpcode() == Instruction::Sub ||
+      I->getOpcode() == Instruction::Xor) {
+    auto *OpC0 = dyn_cast<ConstantInt>(I->getOperand(0));
+    auto *OpC1 = dyn_cast<ConstantInt>(I->getOperand(1));
+    // Check if this is a NOT instruction: -1 - X or X/-1 ^ -1/X
+    if (!OpC0 && (!OpC1 || I->getOpcode() == Instruction::Sub))
+      return {nullptr, UndefBit};
+
+    auto *MaybeNeg1 = OpC0 ? OpC0 : OpC1;
+    if (!MaybeNeg1->isMinusOne())
+      return {nullptr, UndefBit};
+
+    // Look through the NOT. If the inverted value is not an instruction
+    // (e.g. a function argument) it cannot be analyzed; return instead of
+    // relying on an assert, which would leave I null in release builds.
+    I = dyn_cast<Instruction>(I->getOperand(OpC0 ? 1 : 0));
+    if (!I)
+      return {nullptr, UndefBit};
+    Not = true;
+  }
+
+  // We can only use 1 << X without more sophisticated analysis. C << X where
+  // C is a power of 2 but not 1 can result in zero which cannot be translated
+  // to bittest. Likewise any C >> X (either arith or logical) can be zero.
+  if (I->getOpcode() != Instruction::Shl)
+    return {nullptr, UndefBit};
+
+  // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
+  // -X` and some other provable power of 2 patterns that we can use CTZ on
+  // may be profitable.
+  // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
+  // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
+  // be provably a non-zero power of 2.
+  // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
+  // transformable to bittest.
+  auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
+  if (!ShiftVal || !ShiftVal->equalsInt(1))
+    return {nullptr, UndefBit};
+  BTK = Not ? NotShiftBit : ShiftBit;
+
+  Value *BitV = I->getOperand(1);
+  // Read past a shiftmask instruction to find count
+  if (auto *Mask = dyn_cast<Instruction>(BitV)) {
+    if (Mask->getOpcode() == Instruction::And) {
+      auto *MaskC0 = dyn_cast<ConstantInt>(Mask->getOperand(0));
+      auto *MaskC1 = dyn_cast<ConstantInt>(Mask->getOperand(1));
+      if (MaskC0 || MaskC1) {
+        assert(MaskC0 == nullptr || MaskC1 == nullptr);
+        auto *MaskC = MaskC0 ? MaskC0 : MaskC1;
+        // Only look through a mask equal to (bit width - 1).
+        if (MaskC->equalsInt(I->getType()->getPrimitiveSizeInBits() - 1))
+          BitV = MaskC0 ? Mask->getOperand(1) : Mask->getOperand(0);
+      }
+    }
+  }
+
+  return {BitV, BTK};
+}
+
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
   // If the atomicrmw's result isn't actually used, we can just add a "lock"
@@ -31404,51 +31518,138 @@
 
   // If the atomicrmw's result is used by a single bit AND, we may use
   // bts/btr/btc instruction for these operations.
-  auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
+  // Note: InstCombinePass can cause a de-optimization here. It replaces the
+  // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
+  // (depending on CC). This pattern can only use bts/btr/btc but we don't
+  // detect it.
   Instruction *I = AI->user_back();
-  if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
+  auto BitChange = FindSingleBitChange(AI->getValOperand());
+  if (!BitChange.first || BitChange.second == UndefBit || !AI->hasOneUse() ||
+      I->getOpcode() != Instruction::And ||
+      AI->getType()->getPrimitiveSizeInBits() == 8 ||
       AI->getParent() != I->getParent())
     return AtomicExpansionKind::CmpXChg;
+  // AND is commutative, so AI may be either operand of its user.
+  unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
   // The following instruction must be a AND single bit.
-  auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
-  unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
-  if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
+  if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
+    auto *C1 = cast<ConstantInt>(AI->getValOperand());
+    // Read the tested mask from operand 1 specifically:
+    // emitBitTestAtomicRMWIntrinsic takes the immediate from that operand.
+    auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (!C2 || !isPowerOf2_64(C2->getZExtValue()))
+      return AtomicExpansionKind::CmpXChg;
+    // An atomic AND must clear exactly the bit being tested.
+    if (AI->getOperation() == AtomicRMWInst::And)
+      return ~C1->getValue() == C2->getValue()
+                 ? AtomicExpansionKind::BitTestIntrinsic
+                 : AtomicExpansionKind::CmpXChg;
+    return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
+                    : AtomicExpansionKind::CmpXChg;
+  }
+
+  assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
+
+  auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
+  if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
+    return AtomicExpansionKind::CmpXChg;
+
+  assert(BitTested.first != nullptr);
+
+  // If shift amounts are not the same we can't use BitTestIntrinsic
+  if (BitChange.first != BitTested.first)
     return AtomicExpansionKind::CmpXChg;
 
+  // An atomic AND must mask with all bits but one set (NOT of `1 << X`) and
+  // the user must test that one cleared bit (`1 << X`).
   if (AI->getOperation() == AtomicRMWInst::And)
-    return ~C1->getValue() == C2->getValue()
+    return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
                ? AtomicExpansionKind::BitTestIntrinsic
                : AtomicExpansionKind::CmpXChg;
 
-  return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
-                  : AtomicExpansionKind::CmpXChg;
+  // An atomic XOR/OR must set/flip and test the same bit (`1 << X` in both).
+  return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
+             ? AtomicExpansionKind::BitTestIntrinsic
+             : AtomicExpansionKind::CmpXChg;
 }
 
 void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
-  Intrinsic::ID IID = Intrinsic::not_intrinsic;
+  Intrinsic::ID IID_C = Intrinsic::not_intrinsic; // Immediate bit index.
+  Intrinsic::ID IID_I = Intrinsic::not_intrinsic; // Variable bit index.
   switch (AI->getOperation()) {
   default:
     llvm_unreachable("Unknown atomic operation");
   case AtomicRMWInst::Or:
-    IID = Intrinsic::x86_atomic_bts;
+    IID_C = Intrinsic::x86_atomic_bts;
+    IID_I = Intrinsic::x86_atomic_bts_rm;
     break;
   case AtomicRMWInst::Xor:
-    IID = Intrinsic::x86_atomic_btc;
+    IID_C = Intrinsic::x86_atomic_btc;
+    IID_I = Intrinsic::x86_atomic_btc_rm;
     break;
   case AtomicRMWInst::And:
-    IID = Intrinsic::x86_atomic_btr;
+    IID_C = Intrinsic::x86_atomic_btr;
+    IID_I = Intrinsic::x86_atomic_btr_rm;
     break;
   }
   Instruction *I = AI->user_back();
   LLVMContext &Ctx = AI->getContext();
-  unsigned Imm =
-      countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
-  Function *BitTest =
-      Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
                                           Type::getInt8PtrTy(Ctx));
-  Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+  Function *BitTest = nullptr;
+  Value *Result = nullptr;
+  auto BitTested = FindSingleBitChange(AI->getValOperand());
+  assert(BitTested.first != nullptr);
+  if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
+    auto *C = dyn_cast<ConstantInt>(I->getOperand(1));
+    assert(C != nullptr);
+    // Constant mask: pass the index of the tested bit as an i8 immediate.
+    BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
+
+    unsigned Imm = countTrailingZeros(C->getZExtValue());
+    Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+  } else {
+    BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
+
+    assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
+
+    Value *SI = BitTested.first;
+    assert(SI != nullptr);
+
+    // BT{S|R|C} with a memory operand does not reduce the bit position modulo
+    // the operand size, so we need to mask it ourselves.
+    unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
+    Value *BitPos =
+        Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
+    // Todo(1): In many cases it may be provable that SI is less than
+    // ShiftBits in which case this mask is unnecessary
+    // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
+    // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
+    // favor of just a raw BT{S|R|C}.
+
+    Result = Builder.CreateCall(BitTest, {Addr, BitPos});
+    Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
+
+    // If every user of the AND only compares it for (in)equality with zero we
+    // don't need to shift the bit back into position. Otherwise do so once.
+    for (auto It = I->user_begin(); It != I->user_end(); ++It) {
+      if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
+        if (ICmp->isEquality()) {
+          auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
+          auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
+          if (C0 || C1) {
+            assert(C0 == nullptr || C1 == nullptr);
+            if ((C0 ? C0 : C1)->isZero())
+              continue;
+          }
+        }
+      }
+      Result = Builder.CreateShl(Result, BitPos);
+      break;
+    }
+  }
+
   I->replaceAllUsesWith(Result);
   I->eraseFromParent();
   AI->eraseFromParent();
@@ -34221,6 +34422,9 @@
   NODE_NAME_CASE(LBTS)
   NODE_NAME_CASE(LBTC)
   NODE_NAME_CASE(LBTR)
+  NODE_NAME_CASE(LBTS_RM)
+  NODE_NAME_CASE(LBTC_RM)
+  NODE_NAME_CASE(LBTR_RM)
   NODE_NAME_CASE(AADD)
   NODE_NAME_CASE(AOR)
   NODE_NAME_CASE(AXOR)
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -869,6 +869,17 @@
 def x86btr : SDNode<"X86ISD::LBTR", X86LBTest,
                     [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
 
+// Bit-test RMW nodes taking an integer bit index: (i32 result, ptr, int).
+def X86LBTestRM : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+                                       SDTCisInt<2>]>;
+
+def x86_rm_bts : SDNode<"X86ISD::LBTS_RM", X86LBTestRM,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86_rm_btc : SDNode<"X86ISD::LBTC_RM", X86LBTestRM,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86_rm_btr : SDNode<"X86ISD::LBTR_RM", X86LBTestRM,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
 multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
   let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
       SchedRW = [WriteBitTestSetRegRMW]  in {
@@ -887,10 +898,33 @@
   }
 }
 
+// Register-index LOCK BTS/BTC/BTR; no immediate, hence I/RI (not Ii8/RIi8).
+multiclass ATOMIC_LOGIC_OP_RM<bits<8> Opc8, string s> {
+  let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+      SchedRW = [WriteBitTestSetRegRMW]  in {
+    def 16rm : I<Opc8, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                !strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"),
+                [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR16:$src2))]>,
+               OpSize16, TB, LOCK;
+    def 32rm : I<Opc8, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                !strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"),
+                [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR32:$src2))]>,
+               OpSize32, TB, LOCK;
+    def 64rm : RI<Opc8, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 !strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"),
+                 [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR64:$src2))]>,
+               TB, LOCK;
+  }
+}
+
 defm LOCK_BTS : ATOMIC_LOGIC_OP<MRM5m, "bts">;
 defm LOCK_BTC : ATOMIC_LOGIC_OP<MRM7m, "btc">;
 defm LOCK_BTR : ATOMIC_LOGIC_OP<MRM6m, "btr">;
 
+defm LOCK_BTS_RM : ATOMIC_LOGIC_OP_RM<0xAB, "bts">;
+defm LOCK_BTC_RM : ATOMIC_LOGIC_OP_RM<0xBB, "btc">;
+defm LOCK_BTR_RM : ATOMIC_LOGIC_OP_RM<0xB3, "btr">;
+
 // Atomic compare and swap.
 multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
                           string mnemonic, SDPatternOperator frag> {
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -751,49 +751,26 @@
 define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $7, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
-; X86-NEXT:    movzwl %si, %ecx
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB13_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %si, (%edx)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB13_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $7, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_xor_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $7, %cl
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcw %cx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl %dx, %ecx
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB13_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    xorl %ecx, %edx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %dx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB13_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -936,47 +913,26 @@
 define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzwl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB16_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB16_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcw %cx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB16_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    xorl %edx, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB16_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -2384,56 +2340,27 @@
 define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_and_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $7, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movw $-2, %di
-; X86-NEXT:    rolw %cl, %di
-; X86-NEXT:    movzwl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB37_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $7, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB37_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %si, %ecx
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_and_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $7, %cl
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movw $-2, %si
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrw %cx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rolw %cl, %si
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB37_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl %esi, %ecx
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB37_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzwl %dx, %ecx
-; X64-NEXT:    andl %eax, %ecx
-; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i16 %c, 7
@@ -2575,55 +2502,26 @@
 define zeroext i16 @atomic_shl1_mask01_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_and_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movw $-2, %di
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    rolw %cl, %di
-; X86-NEXT:    movzwl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB40_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB40_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movw $-2, %r8w
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolw %cl, %r8w
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB40_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl %r8d, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB40_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrw %cx, (%rdi)
+; X64-NEXT:    setb %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -3855,40 +3753,25 @@
 define i32 @atomic_shl1_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB60_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, (%esi)
-; X86-NEXT:    jne .LBB60_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB60_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, (%rdi)
-; X64-NEXT:    jne .LBB60_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -3900,42 +3783,25 @@
 define i32 @atomic_shl1_small_mask_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB61_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, (%edx)
-; X86-NEXT:    jne .LBB61_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB61_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, (%rdi)
-; X64-NEXT:    jne .LBB61_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 15
@@ -3948,47 +3814,25 @@
 define i32 @atomic_shl1_mask0_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB62_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB62_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB62_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB62_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 31
@@ -4002,47 +3846,25 @@
 define i32 @atomic_shl1_mask1_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB63_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB63_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB63_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB63_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -4056,40 +3878,25 @@
 define i32 @atomic_shl1_mask01_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB64_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, (%esi)
-; X86-NEXT:    jne .LBB64_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB64_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, (%rdi)
-; X64-NEXT:    jne .LBB64_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 31
@@ -4807,54 +4614,30 @@
 define i32 @atomic_shl1_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB78_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB78_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB78_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB78_5
-; X86-NEXT:  .LBB78_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB78_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB78_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB78_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB78_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB78_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB78_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB78_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB78_3:
+; X64-NEXT:  .LBB78_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -4878,56 +4661,28 @@
 define i32 @atomic_shl1_small_mask_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB79_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB79_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB79_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB79_5
-; X86-NEXT:  .LBB79_3:
+; X86-NEXT:    lock btsl %ecx, (%eax)
+; X86-NEXT:    jae .LBB79_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB79_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB79_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB79_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB79_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB79_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    lock btsl %esi, (%rdi)
+; X64-NEXT:    jae .LBB79_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB79_3:
+; X64-NEXT:  .LBB79_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -4952,54 +4707,30 @@
 define i32 @atomic_shl1_mask0_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB80_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB80_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB80_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB80_5
-; X86-NEXT:  .LBB80_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB80_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB80_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB80_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB80_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB80_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB80_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB80_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB80_3:
+; X64-NEXT:  .LBB80_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5025,54 +4756,30 @@
 define i32 @atomic_shl1_mask1_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB81_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB81_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB81_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB81_5
-; X86-NEXT:  .LBB81_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB81_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB81_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB81_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB81_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB81_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB81_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB81_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB81_3:
+; X64-NEXT:  .LBB81_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5098,54 +4805,30 @@
 define i32 @atomic_shl1_mask01_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB82_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB82_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB82_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB82_5
-; X86-NEXT:  .LBB82_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB82_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB82_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB82_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB82_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB82_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB82_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB82_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB82_3:
+; X64-NEXT:  .LBB82_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5243,56 +4926,31 @@
 define i32 @atomic_shl1_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB84_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    lock cmpxchgl %edx, (%esi)
-; X86-NEXT:    jne .LBB84_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    testl %edi, %eax
-; X86-NEXT:    jne .LBB84_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB84_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB84_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB84_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB84_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    orl %esi, %edx
-; X64-NEXT:    lock cmpxchgl %edx, (%rdi)
-; X64-NEXT:    jne .LBB84_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    testl %esi, %eax
-; X64-NEXT:    je .LBB84_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB84_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB84_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB84_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -5315,58 +4973,29 @@
 define i32 @atomic_shl1_small_mask_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB85_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    lock cmpxchgl %edx, (%esi)
-; X86-NEXT:    jne .LBB85_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    testl %edi, %eax
-; X86-NEXT:    jne .LBB85_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB85_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $15, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB85_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB85_1: # %if.then
+; X86-NEXT:    movl (%ecx,%edx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movl $1, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB85_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    orl %esi, %edx
-; X64-NEXT:    lock cmpxchgl %edx, (%rdi)
-; X64-NEXT:    jne .LBB85_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    testl %esi, %eax
-; X64-NEXT:    je .LBB85_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    lock btsl %esi, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB85_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB85_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB85_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 15
@@ -5390,56 +5019,31 @@
 define i32 @atomic_shl1_mask0_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB86_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%esi)
-; X86-NEXT:    jne .LBB86_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jb .LBB86_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB86_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB86_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB86_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB86_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB86_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB86_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB86_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB86_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB86_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %rem = and i32 %c, 31
@@ -5464,56 +5068,31 @@
 define i32 @atomic_shl1_mask1_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB87_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%esi)
-; X86-NEXT:    jne .LBB87_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jb .LBB87_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB87_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB87_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB87_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB87_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB87_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB87_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB87_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB87_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB87_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -5538,56 +5117,31 @@
 define i32 @atomic_shl1_mask01_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB88_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    lock cmpxchgl %edx, (%esi)
-; X86-NEXT:    jne .LBB88_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    testl %edi, %eax
-; X86-NEXT:    jne .LBB88_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB88_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB88_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB88_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB88_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    orl %esi, %edx
-; X64-NEXT:    lock cmpxchgl %edx, (%rdi)
-; X64-NEXT:    jne .LBB88_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    testl %esi, %eax
-; X64-NEXT:    je .LBB88_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB88_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB88_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB88_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %rem = and i32 %c, 31
@@ -5685,54 +5239,30 @@
 define i32 @atomic_shl1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB90_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB90_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB90_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB90_5
-; X86-NEXT:  .LBB90_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB90_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB90_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB90_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB90_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB90_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB90_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB90_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB90_3:
+; X64-NEXT:  .LBB90_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5756,56 +5286,28 @@
 define i32 @atomic_shl1_small_mask_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB91_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB91_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB91_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB91_5
-; X86-NEXT:  .LBB91_3:
+; X86-NEXT:    lock btsl %ecx, (%eax)
+; X86-NEXT:    jae .LBB91_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB91_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB91_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB91_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB91_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB91_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    lock btsl %esi, (%rdi)
+; X64-NEXT:    jae .LBB91_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB91_3:
+; X64-NEXT:  .LBB91_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5830,54 +5332,30 @@
 define i32 @atomic_shl1_mask0_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB92_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB92_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB92_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB92_5
-; X86-NEXT:  .LBB92_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB92_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB92_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB92_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB92_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB92_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB92_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB92_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB92_3:
+; X64-NEXT:  .LBB92_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5903,54 +5381,30 @@
 define i32 @atomic_shl1_mask1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB93_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB93_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB93_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB93_5
-; X86-NEXT:  .LBB93_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB93_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB93_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB93_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB93_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB93_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB93_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB93_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB93_3:
+; X64-NEXT:  .LBB93_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5973,57 +5427,33 @@
   ret i32 %retval.0
 }
 
-define i32 @atomic_shl1_mask01_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
-; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brnz:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB94_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB94_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB94_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB94_5
-; X86-NEXT:  .LBB94_3:
+define i32 @atomic_shl1_mask01_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
+; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brnz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB94_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB94_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB94_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB94_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB94_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB94_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB94_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB94_3:
+; X64-NEXT:  .LBB94_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -7430,19 +6860,12 @@
 ; X64-LABEL: atomic_shl1_xor_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB122_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    xorq %rdx, %rcx
-; X64-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; X64-NEXT:    jne .LBB122_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -7625,20 +7048,12 @@
 ; X64-LABEL: atomic_shl1_small_mask_xor_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    andb $31, %cl
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB125_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    xorq %rdx, %rcx
-; X64-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; X64-NEXT:    jne .LBB125_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %rem = and i64 %c, 31
@@ -7701,21 +7116,12 @@
 ; X64-LABEL: atomic_shl1_mask0_xor_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB126_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    xorq %rdx, %rsi
-; X64-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; X64-NEXT:    jne .LBB126_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -7779,21 +7185,12 @@
 ; X64-LABEL: atomic_shl1_mask1_xor_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB127_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    xorq %rdx, %rsi
-; X64-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; X64-NEXT:    jne .LBB127_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -7846,19 +7243,12 @@
 ; X64-LABEL: atomic_shl1_mask01_xor_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB128_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    xorq %rdx, %rcx
-; X64-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; X64-NEXT:    jne .LBB128_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -9182,21 +8572,12 @@
 ; X64-LABEL: atomic_shl1_and_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq $-2, %rsi
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    rolq %cl, %rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB146_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    andq %rsi, %rcx
-; X64-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; X64-NEXT:    jne .LBB146_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -9401,22 +8782,12 @@
 ; X64-LABEL: atomic_shl1_small_mask_and_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    andb $31, %cl
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq $-2, %rsi
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    rolq %cl, %rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB149_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    andq %rsi, %rcx
-; X64-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; X64-NEXT:    jne .LBB149_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %rem = and i64 %c, 31
@@ -9482,21 +8853,12 @@
 ; X64-LABEL: atomic_shl1_mask0_and_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movq $-2, %rdx
-; X64-NEXT:    rolq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB150_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    andq %rdx, %rsi
-; X64-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; X64-NEXT:    jne .LBB150_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -9563,21 +8925,12 @@
 ; X64-LABEL: atomic_shl1_mask1_and_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movq $-2, %rdx
-; X64-NEXT:    rolq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB151_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    andq %rdx, %rsi
-; X64-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; X64-NEXT:    jne .LBB151_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -9637,21 +8990,12 @@
 ; X64-LABEL: atomic_shl1_mask01_and_64_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq $-2, %rsi
+; X64-NEXT:    andl $63, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrq %rcx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    rolq %cl, %rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB152_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    andq %rsi, %rcx
-; X64-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; X64-NEXT:    jne .LBB152_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -10426,26 +9770,14 @@
 ;
 ; X64-LABEL: atomic_shl1_and_64_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq $-2, %rsi
-; X64-NEXT:    rolq %cl, %rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB162_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    andq %rsi, %r8
-; X64-NEXT:    lock cmpxchgq %r8, (%rdi)
-; X64-NEXT:    jne .LBB162_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testq %rdx, %rax
-; X64-NEXT:    je .LBB162_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movq (%rdi,%rcx,8), %rax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $63, %eax
+; X64-NEXT:    lock btrq %rax, (%rdi)
+; X64-NEXT:    jae .LBB162_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movq (%rdi,%rsi,8), %rax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB162_3:
+; X64-NEXT:  .LBB162_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -10725,27 +10057,13 @@
 ;
 ; X64-LABEL: atomic_shl1_small_mask_and_64_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    andl $31, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq $-2, %rsi
-; X64-NEXT:    rolq %cl, %rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB165_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    andq %rsi, %r8
-; X64-NEXT:    lock cmpxchgq %r8, (%rdi)
-; X64-NEXT:    jne .LBB165_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB165_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movq (%rdi,%rcx,8), %rax
+; X64-NEXT:    andl $31, %esi
+; X64-NEXT:    lock btrq %rsi, (%rdi)
+; X64-NEXT:    jae .LBB165_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movq (%rdi,%rsi,8), %rax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB165_3:
+; X64-NEXT:  .LBB165_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -10832,24 +10150,14 @@
 ;
 ; X64-LABEL: atomic_shl1_mask0_and_64_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movq $-2, %rdx
-; X64-NEXT:    rolq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB166_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    andq %rdx, %rsi
-; X64-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; X64-NEXT:    jne .LBB166_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btq %rcx, %rax
-; X64-NEXT:    jae .LBB166_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movq (%rdi,%rcx,8), %rax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $63, %eax
+; X64-NEXT:    lock btrq %rax, (%rdi)
+; X64-NEXT:    jae .LBB166_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movq (%rdi,%rsi,8), %rax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB166_3:
+; X64-NEXT:  .LBB166_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -10937,24 +10245,14 @@
 ;
 ; X64-LABEL: atomic_shl1_mask1_and_64_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movq $-2, %rdx
-; X64-NEXT:    rolq %cl, %rdx
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB167_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    andq %rdx, %rsi
-; X64-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; X64-NEXT:    jne .LBB167_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btq %rcx, %rax
-; X64-NEXT:    jae .LBB167_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movq (%rdi,%rcx,8), %rax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $63, %eax
+; X64-NEXT:    lock btrq %rax, (%rdi)
+; X64-NEXT:    jae .LBB167_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movq (%rdi,%rsi,8), %rax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB167_3:
+; X64-NEXT:  .LBB167_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -11035,26 +10333,14 @@
 ;
 ; X64-LABEL: atomic_shl1_mask01_and_64_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq $-2, %rsi
-; X64-NEXT:    rolq %cl, %rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB168_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    andq %rsi, %r8
-; X64-NEXT:    lock cmpxchgq %r8, (%rdi)
-; X64-NEXT:    jne .LBB168_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testq %rdx, %rax
-; X64-NEXT:    je .LBB168_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movq (%rdi,%rcx,8), %rax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $63, %eax
+; X64-NEXT:    lock btrq %rax, (%rdi)
+; X64-NEXT:    jae .LBB168_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movq (%rdi,%rsi,8), %rax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB168_3:
+; X64-NEXT:  .LBB168_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry: