diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -25,6 +25,7 @@ set(sources
   X86AsmPrinter.cpp
+  X86AtomicANDHoist.cpp
   X86AvoidTrailingCall.cpp
   X86CallFrameOptimization.cpp
   X86CallingConv.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -145,6 +145,10 @@
 /// ways.
 FunctionPass *createX86PartialReductionPass();
 
+/// This pass hoists an AND that uses the result of an atomic logic
+/// instruction into the same basic block so that ISel can optimize them.
+FunctionPass *createX86AtomicANDHoistPass();
+
 InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);
@@ -159,6 +163,7 @@
 void initializeFixupLEAPassPass(PassRegistry &);
 void initializeFPSPass(PassRegistry &);
 void initializeWinEHStatePassPass(PassRegistry &);
+void initializeX86AtomicANDHoistPass(PassRegistry &);
 void initializeX86AvoidSFBPassPass(PassRegistry &);
 void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
 void initializeX86CallFrameOptimizationPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86AtomicANDHoist.cpp b/llvm/lib/Target/X86/X86AtomicANDHoist.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/X86/X86AtomicANDHoist.cpp
@@ -0,0 +1,66 @@
+//===- X86AtomicANDHoist.cpp - Hoist ANDs that use atomic logic ops ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-atomic-and-hoist"
+
+namespace {
+
+class X86AtomicANDHoist : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid.
+
+  X86AtomicANDHoist() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
+
+  StringRef getPassName() const override { return "X86 Atomic AND Hoist"; }
+};
+} // namespace
+
+FunctionPass *llvm::createX86AtomicANDHoistPass() {
+  return new X86AtomicANDHoist();
+}
+
+char X86AtomicANDHoist::ID = 0;
+
+INITIALIZE_PASS(X86AtomicANDHoist, DEBUG_TYPE, "X86 Atomic AND Hoist", false,
+                false)
+
+bool X86AtomicANDHoist::runOnFunction(Function &F) {
+  bool Changed = false;
+
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      auto *AI = dyn_cast<AtomicRMWInst>(&I);
+      if (!AI || !AI->hasOneUse())
+        continue;
+      Instruction *And = AI->user_back();
+      if (And->getOpcode() != Instruction::And || And->getParent() == &BB ||
+          !isa<ConstantInt>(And->getOperand(1)))
+        continue;
+      AtomicRMWInst::BinOp Op = AI->getOperation();
+      if (Op != AtomicRMWInst::Or && Op != AtomicRMWInst::And &&
+          Op != AtomicRMWInst::Xor)
+        continue;
+      Changed = true;
+      And->moveAfter(AI);
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -790,6 +790,9 @@
     LOR,
     LXOR,
     LAND,
+    LBTS,
+    LBTC,
+    LBTR,
 
     // Load, scalar_to_vector, and zero extend.
    VZEXT_LOAD,
@@ -1640,6 +1643,8 @@
   bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandAtomicLogicRMWInIR(AtomicRMWInst *AI) const;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5685,7 +5685,12 @@
 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
     const Instruction &AndI) const {
-  return true;
+  auto *AI = dyn_cast<AtomicRMWInst>(AndI.getOperand(0));
+  if (!AI)
+    return true;
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+  return Op != AtomicRMWInst::Or && Op != AtomicRMWInst::And &&
+         Op != AtomicRMWInst::Xor;
 }
 
 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
@@ -30313,6 +30318,33 @@
                                  : AtomicExpansionKind::None;
 }
 
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicLogicRMWInIR(AtomicRMWInst *AI) const {
+  // If the atomicrmw's result isn't actually used, we can just add a "lock"
+  // prefix to a normal instruction for these operations.
+  if (AI->use_empty())
+    return AtomicExpansionKind::None;
+
+  // If the atomicrmw's result is used by a single-bit AND, we may use the
+  // bts/btr/btc instructions for these operations.
+  auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
+  Instruction *I = AI->user_back();
+  if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
+      AI->getParent() != I->getParent())
+    return AtomicExpansionKind::CmpXChg;
+  // The user must be an AND with a single-bit constant (no 8-bit bt exists).
+  auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
+  unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
+  if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
+    return AtomicExpansionKind::CmpXChg;
+
+  if (AI->getOperation() == AtomicRMWInst::And)
+    return ~C1->getValue() == C2->getValue() ? AtomicExpansionKind::None
+                                             : AtomicExpansionKind::CmpXChg;
+
+  return C1 == C2 ? AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+}
+
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
@@ -30337,10 +30369,7 @@
   case AtomicRMWInst::Or:
   case AtomicRMWInst::And:
   case AtomicRMWInst::Xor:
-    // If the atomicrmw's result isn't actually used, we can just add a "lock"
-    // prefix to a normal instruction for these operations.
-    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
-                            : AtomicExpansionKind::None;
+    return shouldExpandAtomicLogicRMWInIR(AI);
   case AtomicRMWInst::Nand:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
@@ -31038,6 +31067,40 @@
     return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
                          AN->getMemOperand());
   }
+  if (Opc == ISD::ATOMIC_LOAD_OR || Opc == ISD::ATOMIC_LOAD_XOR ||
+      Opc == ISD::ATOMIC_LOAD_AND) {
+    ConstantSDNode *C1 = cast<ConstantSDNode>(RHS);
+    ConstantSDNode *C2 = nullptr;
+    for (auto UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) {
+      // Skip uses of the chain value. Result 0 of the node is the load value.
+      if (UI.getUse().getResNo() != 0)
+        continue;
+      if (C2 || UI->getOpcode() != ISD::AND)
+        report_fatal_error("Atomic result must be used by one AND");
+      C2 = cast<ConstantSDNode>(UI->getOperand(1));
+      assert(isPowerOf2_64(C2->getZExtValue()) && "Must be power of 2 value");
+    }
+    if (Opc == ISD::ATOMIC_LOAD_AND) {
+      assert(~C1->getAPIntValue() == C2->getAPIntValue() &&
+             "Cannot lower to BTR");
+      Opc = X86ISD::LBTR;
+    } else {
+      assert(C1 == C2 && "Cannot lower to BTS/BTC");
+      Opc = Opc == ISD::ATOMIC_LOAD_OR ? X86ISD::LBTS : X86ISD::LBTC;
+    }
+
+    unsigned Imm = countTrailingZeros(C2->getZExtValue());
+    MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+    SDValue Res = DAG.getMemIntrinsicNode(
+        Opc, DL, DAG.getVTList(VT, MVT::Other),
+        {Chain, LHS, DAG.getConstant(Imm, DL, VT)}, VT, MMO);
+    Chain = Res.getValue(1);
+    Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
+    if (Imm)
+      Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+                        DAG.getShiftAmountConstant(Imm, VT, DL));
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Res, Chain);
+  }
   assert(Opc == ISD::ATOMIC_LOAD_ADD &&
          "Used AtomicRMW ops other than Add should have been expanded!");
   return N;
@@ -32821,6 +32884,9 @@
   NODE_NAME_CASE(LOR)
   NODE_NAME_CASE(LXOR)
   NODE_NAME_CASE(LAND)
+  NODE_NAME_CASE(LBTS)
+  NODE_NAME_CASE(LBTC)
+  NODE_NAME_CASE(LBTR)
   NODE_NAME_CASE(VZEXT_MOVL)
   NODE_NAME_CASE(VZEXT_LOAD)
   NODE_NAME_CASE(VEXTRACT_STORE)
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -839,6 +839,28 @@
 def : Pat<(X86lock_sub addr:$dst, (i64 -1)), (LOCK_INC64m addr:$dst)>;
 }
 
+multiclass ATOMIC_LOG<Format Form, string mnemonic, SDNode frag> {
+  let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+      SchedRW = [WriteBitTestSetRegRMW] in {
+    def 16m : Ii8<0xBA, Form, (outs), (ins i16mem:$src1, i16imm:$src2),
+                  !strconcat(mnemonic, "{w}\t{$src2, $src1|$src1, $src2}"),
+                  [(set EFLAGS, (frag addr:$src1, (i16 imm:$src2)))]>,
+              OpSize16, TB, LOCK;
+    def 32m : Ii8<0xBA, Form, (outs), (ins i32mem:$src1, i32imm:$src2),
+                  !strconcat(mnemonic, "{l}\t{$src2, $src1|$src1, $src2}"),
+                  [(set EFLAGS, (frag addr:$src1, (i32 imm:$src2)))]>,
+              OpSize32, TB, LOCK;
+    def 64m : RIi8<0xBA, Form, (outs), (ins i64mem:$src1, i64imm:$src2),
+                   !strconcat(mnemonic, "{q}\t{$src2, $src1|$src1, $src2}"),
+                   [(set EFLAGS, (frag addr:$src1, (i64 imm:$src2)))]>,
+               TB, LOCK;
+  }
+}
+
+defm LOCK_BTS : ATOMIC_LOG<MRM5m, "bts", X86lock_bts>;
+defm LOCK_BTR : ATOMIC_LOG<MRM6m, "btr", X86lock_btr>;
+defm LOCK_BTC : ATOMIC_LOG<MRM7m, "btc", X86lock_btc>;
+
 // Atomic compare and swap.
multiclass LCMPXCHG_BinOp Opc8, bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag> { diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -282,6 +282,15 @@ def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +def X86lock_bts : SDNode<"X86ISD::LBTS", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_btc : SDNode<"X86ISD::LBTC", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_btr : SDNode<"X86ISD::LBTR", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>; diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -485,6 +485,7 @@ } bool X86PassConfig::addPreISel() { + addPass(createX86AtomicANDHoistPass()); // Only add this pass for 32-bit x86 Windows. const Triple &TT = TM->getTargetTriple(); if (TT.isOSWindows() && TT.getArch() == Triple::x86) diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -31,6 +31,7 @@ ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: X86 Atomic AND Hoist ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll --- a/llvm/test/CodeGen/X86/atomic-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll @@ -9,35 +9,17 @@ define i16 @bts1() nounwind { ; X86-LABEL: bts1: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB0_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl $1, %ecx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB0_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $1, %eax +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsw $0, v16 +; X86-NEXT: setb %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: bts1: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB0_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl $1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB0_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $1, %eax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsw $0, v16(%rip) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -49,35 +31,19 @@ define i16 @bts2() nounwind { ; X86-LABEL: bts2: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB1_1: # 
%atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl $2, %ecx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB1_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $2, %eax +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsw $1, v16 +; X86-NEXT: setb %al +; X86-NEXT: addl %eax, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: bts2: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB1_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl $2, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB1_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $2, %eax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsw $1, v16(%rip) +; X64-NEXT: setb %al +; X64-NEXT: addl %eax, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -89,35 +55,19 @@ define i16 @bts15() nounwind { ; X86-LABEL: bts15: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB2_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl $32768, %ecx # imm = 0x8000 -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB2_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $32768, %eax # imm = 0x8000 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsw $15, v16 +; X86-NEXT: setb %al +; X86-NEXT: shll $15, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: bts15: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB2_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl $32768, %ecx # imm = 0x8000 -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB2_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $32768, %eax # imm = 0x8000 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsw $15, v16(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shll $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -129,30 +79,18 @@ define i32 @bts31() nounwind { ; X86-LABEL: bts31: ; X86: # %bb.0: # %entry -; X86-NEXT: movl v32, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB3_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl $-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: lock cmpxchgl %ecx, v32 -; X86-NEXT: jne .LBB3_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsl $31, v32 +; X86-NEXT: setb %al +; X86-NEXT: shll $31, %eax ; X86-NEXT: retl ; ; X64-LABEL: bts31: ; X64: # %bb.0: # %entry -; X64-NEXT: movl v32(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB3_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl $-2147483648, 
%ecx # imm = 0x80000000 -; X64-NEXT: lock cmpxchgl %ecx, v32(%rip) -; X64-NEXT: jne .LBB3_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsl $31, v32(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shll $31, %eax ; X64-NEXT: retq entry: %0 = atomicrmw or i32* @v32, i32 2147483648 monotonic, align 4 @@ -185,17 +123,10 @@ ; ; X64-LABEL: bts63: ; X64: # %bb.0: # %entry -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: movq v64(%rip), %rax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB4_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: orq %rcx, %rdx -; X64-NEXT: lock cmpxchgq %rdx, v64(%rip) -; X64-NEXT: jne .LBB4_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsq $63, v64(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shlq $63, %rax ; X64-NEXT: retq entry: %0 = atomicrmw or i64* @v64, i64 -9223372036854775808 monotonic, align 8 @@ -206,35 +137,17 @@ define i16 @btc1() nounwind { ; X86-LABEL: btc1: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB5_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl $1, %ecx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB5_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $1, %eax +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btcw $0, v16 +; X86-NEXT: setb %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: btc1: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB5_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl $1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB5_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $1, %eax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btcw $0, v16(%rip) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -246,35 +159,19 @@ define i16 @btc2() nounwind { ; X86-LABEL: btc2: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB6_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl $2, %ecx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB6_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $2, %eax +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btcw $1, v16 +; X86-NEXT: setb %al +; X86-NEXT: addl %eax, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: btc2: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB6_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl $2, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: 
jne .LBB6_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $2, %eax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btcw $1, v16(%rip) +; X64-NEXT: setb %al +; X64-NEXT: addl %eax, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -286,35 +183,19 @@ define i16 @btc15() nounwind { ; X86-LABEL: btc15: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB7_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl $32768, %ecx # imm = 0x8000 -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB7_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $32768, %eax # imm = 0x8000 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btcw $15, v16 +; X86-NEXT: setb %al +; X86-NEXT: shll $15, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: btc15: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB7_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl $32768, %ecx # imm = 0x8000 -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB7_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $32768, %eax # imm = 0x8000 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btcw $15, v16(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shll $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -326,30 +207,18 @@ define i32 @btc31() nounwind { ; X86-LABEL: btc31: ; X86: # %bb.0: # %entry -; X86-NEXT: movl v32, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB8_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: lock cmpxchgl %ecx, v32 -; X86-NEXT: jne .LBB8_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btcl $31, v32 +; X86-NEXT: setb %al +; X86-NEXT: shll $31, %eax ; X86-NEXT: retl ; ; X64-LABEL: btc31: ; X64: # %bb.0: # %entry -; X64-NEXT: movl v32(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB8_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 -; X64-NEXT: lock cmpxchgl %ecx, v32(%rip) -; X64-NEXT: jne .LBB8_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btcl $31, v32(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shll $31, %eax ; X64-NEXT: retq entry: %0 = atomicrmw xor i32* @v32, i32 2147483648 monotonic, align 4 @@ -382,17 +251,10 @@ ; ; X64-LABEL: btc63: ; X64: # %bb.0: # %entry -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: movq v64(%rip), %rax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB9_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: xorq %rcx, %rdx -; X64-NEXT: lock cmpxchgq %rdx, v64(%rip) -; X64-NEXT: jne .LBB9_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock 
btcq $63, v64(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shlq $63, %rax ; X64-NEXT: retq entry: %0 = atomicrmw xor i64* @v64, i64 -9223372036854775808 monotonic, align 8 @@ -403,35 +265,17 @@ define i16 @btr1() nounwind { ; X86-LABEL: btr1: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB10_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $65534, %ecx # imm = 0xFFFE -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB10_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $1, %eax +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btrw $0, v16 +; X86-NEXT: setb %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: btr1: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB10_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $65534, %ecx # imm = 0xFFFE -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB10_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $1, %eax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btrw $0, v16(%rip) +; X64-NEXT: setb %al ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -443,35 +287,19 @@ define i16 @btr2() nounwind { ; X86-LABEL: btr2: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB11_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $65533, %ecx # imm = 0xFFFD -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB11_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $2, %eax +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btrw $1, v16 +; X86-NEXT: setb %al +; X86-NEXT: addl %eax, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: btr2: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB11_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $65533, %ecx # imm = 0xFFFD -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB11_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $2, %eax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btrw $1, v16(%rip) +; X64-NEXT: setb %al +; X64-NEXT: addl %eax, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -483,35 +311,19 @@ define i16 @btr15() nounwind { ; X86-LABEL: btr15: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB12_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $32768, %eax # imm = 0x8000 +; 
X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btrw $15, v16 +; X86-NEXT: setb %al +; X86-NEXT: shll $15, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: btr15: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB12_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $eax -; X64-NEXT: jne .LBB12_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $32768, %eax # imm = 0x8000 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btrw $15, v16(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shll $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: @@ -523,30 +335,18 @@ define i32 @btr31() nounwind { ; X86-LABEL: btr31: ; X86: # %bb.0: # %entry -; X86-NEXT: movl v32, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB13_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: lock cmpxchgl %ecx, v32 -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btrl $31, v32 +; X86-NEXT: setb %al +; X86-NEXT: shll $31, %eax ; X86-NEXT: retl ; ; X64-LABEL: btr31: ; X64: # %bb.0: # %entry -; X64-NEXT: movl v32(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB13_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-NEXT: lock cmpxchgl %ecx, v32(%rip) -; X64-NEXT: jne .LBB13_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btrl $31, v32(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shll $31, %eax ; X64-NEXT: retq entry: %0 = atomicrmw and i32* @v32, i32 2147483647 monotonic, align 4 @@ -585,18 +385,10 @@ ; ; X64-LABEL: btr63: ; X64: # %bb.0: # %entry -; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; X64-NEXT: movq v64(%rip), %rax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB14_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: lock cmpxchgq %rdx, v64(%rip) -; X64-NEXT: jne .LBB14_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: incq %rcx -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btrq $63, v64(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shlq $63, %rax ; X64-NEXT: retq entry: %0 = atomicrmw and i64* @v64, i64 9223372036854775807 monotonic, align 8 @@ -655,36 +447,18 @@ define i16 @multi_use2() nounwind { ; X86-LABEL: multi_use2: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl v16, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB16_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl $1, %ecx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: jne .LBB16_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: andl $1, %eax +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsw $0, v16 +; X86-NEXT: setb %al ; X86-NEXT: leal 
(%eax,%eax,2), %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: multi_use2: ; X64: # %bb.0: # %entry -; X64-NEXT: movzwl v16(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB16_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl $1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $rax -; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: # kill: def $ax killed $ax def $rax -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: andl $1, %eax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsw $0, v16(%rip) +; X64-NEXT: setb %al ; X64-NEXT: leal (%rax,%rax,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -764,39 +538,23 @@ define void @no_and_cmp0_fold() nounwind { ; X86-LABEL: no_and_cmp0_fold: ; X86: # %bb.0: # %entry -; X86-NEXT: movl v32, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB18_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: orl $8, %ecx -; X86-NEXT: lock cmpxchgl %ecx, v32 -; X86-NEXT: jne .LBB18_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end +; X86-NEXT: lock btsl $3, v32 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB18_3 -; X86-NEXT: # %bb.4: # %if.end +; X86-NEXT: je .LBB18_1 +; X86-NEXT: # %bb.2: # %if.end ; X86-NEXT: retl -; X86-NEXT: .LBB18_3: # %if.then +; X86-NEXT: .LBB18_1: # %if.then ; ; X64-LABEL: no_and_cmp0_fold: ; X64: # %bb.0: # %entry -; X64-NEXT: movl v32(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB18_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl $8, %ecx -; X64-NEXT: lock cmpxchgl %ecx, v32(%rip) -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end +; X64-NEXT: lock btsl $3, v32(%rip) ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testb %al, %al -; X64-NEXT: je .LBB18_3 -; X64-NEXT: # %bb.4: # %if.end +; X64-NEXT: je .LBB18_1 +; X64-NEXT: # %bb.2: # %if.end ; X64-NEXT: retq -; X64-NEXT: .LBB18_3: # %if.then +; X64-NEXT: .LBB18_1: # %if.then entry: %0 = atomicrmw or i32* @v32, i32 8 monotonic, align 4 %and = and i32 %0, 8 @@ -815,32 +573,20 @@ ; X86-LABEL: split_hoist_and: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl v32, %eax -; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB19_1: # %atomicrmw.start -; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %eax, %edx -; X86-NEXT: orl $8, %edx -; X86-NEXT: lock cmpxchgl %edx, v32 -; X86-NEXT: jne .LBB19_1 -; X86-NEXT: # %bb.2: # %atomicrmw.end +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: lock btsl $3, v32 +; X86-NEXT: setb %al +; X86-NEXT: shll $3, %eax ; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: andl $8, %eax ; X86-NEXT: retl ; ; X64-LABEL: split_hoist_and: ; X64: # %bb.0: -; X64-NEXT: movl v32(%rip), %eax -; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB19_1: # %atomicrmw.start -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: orl $8, %ecx -; X64-NEXT: lock cmpxchgl %ecx, v32(%rip) -; X64-NEXT: jne .LBB19_1 -; X64-NEXT: # %bb.2: # %atomicrmw.end +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: lock btsl $3, v32(%rip) +; X64-NEXT: setb %al +; X64-NEXT: shll $3, %eax ; X64-NEXT: testl %edi, %edi -; X64-NEXT: andl $8, %eax ; X64-NEXT: retq %2 = atomicrmw or i32* @v32, i32 8 monotonic, align 4 %3 = tail call i32 @llvm.ctlz.i32(i32 %0, i1 false) diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll 
b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -67,6 +67,7 @@ ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: X86 Atomic AND Hoist ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier
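
For reference, a minimal source-level sketch of the idiom this patch targets (illustrative only; the variable, function name, and bit position are made up, but per the atomic-bit-test.ll cases above an or/xor/and atomicrmw whose only use is an AND with the same single-bit constant should now select to lock bts/btc/btr plus setb, rather than a cmpxchg loop):

  /* Illustrative C, not part of the patch: atomic_fetch_or_explicit lowers to
     "atomicrmw or ... monotonic", and masking its result with the same
     power-of-2 constant is the pattern shouldExpandAtomicLogicRMWInIR accepts. */
  #include <stdatomic.h>

  _Atomic unsigned flags;

  int test_and_set_bit3(void) {
    /* Expected with this patch: lock btsl $3, flags; setb %al; then a shift
       by 3 to rebuild the masked value, instead of a cmpxchg retry loop. */
    return atomic_fetch_or_explicit(&flags, 8u, memory_order_relaxed) & 8u;
  }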