diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h --- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h @@ -9,7 +9,8 @@ // This file exposes a function named BuildMI, which is useful for dramatically // simplifying how MachineInstr's are created. It allows use of code like this: // -// M = BuildMI(MBB, MI, DL, TII.get(X86::ADD8rr), Dst) +// MIMetadata MIMD(MI); // Propagates DebugLoc and other metadata +// M = BuildMI(MBB, MI, MIMD, TII.get(X86::ADD8rr), Dst) // .addReg(argVal1) // .addReg(argVal2); // diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2478,10 +2478,12 @@ // If this is not a fall-through branch or optimizations are switched off, // emit the branch. - if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None) - DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), - MVT::Other, getControlRoot(), - DAG.getBasicBlock(Succ0MBB))); + if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None) { + auto Br = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, + getControlRoot(), DAG.getBasicBlock(Succ0MBB)); + setValue(&I, Br); + DAG.setRoot(Br); + } return; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36263,7 +36263,7 @@ /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); @@ -36313,28 +36313,28 @@ // xbegin fallMBB // # fallthrough to mainMBB // # abort to fallMBB - BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); + BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(fallMBB); // mainMBB: // mainDstReg := -1 - BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); - BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); + BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); + BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); // fallMBB: // ; pseudo instruction to model hardware's definition from XABORT // EAX := XABORT_DEF // fallDstReg := EAX - BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); - BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) + BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF)); + BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); fallMBB->addSuccessor(sinkMBB); // sinkMBB: // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) + BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(fallDstReg).addMBB(fallMBB); @@ -36387,7 +36387,7 @@ const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout())); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); - const DebugLoc &DL = MI.getDebugLoc(); + const 
MIMetadata MIMD(MI); // struct va_list { // i32 gp_offset @@ -36471,7 +36471,7 @@ // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); - BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) + BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg) .add(Base) .add(Scale) .add(Index) @@ -36480,13 +36480,13 @@ .setMemRefs(LoadOnlyMMO); // Check if there is enough room left to pull this argument. - BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) + BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri)) .addReg(OffsetReg) .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise - BuildMI(thisMBB, DL, TII->get(X86::JCC_1)) + BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1)) .addMBB(overflowMBB).addImm(X86::COND_AE); } @@ -36497,7 +36497,7 @@ // Read the reg_save_area address. Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI( - offsetMBB, DL, + offsetMBB, MIMD, TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), RegSaveReg) .add(Base) @@ -36510,30 +36510,30 @@ if (Subtarget.isTarget64BitLP64()) { // Zero-extend the offset Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); // Add the offset to the reg_save_area to get the final address. - BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) + BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg) .addReg(OffsetReg64) .addReg(RegSaveReg); } else { // Add the offset to the reg_save_area to get the final address. - BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg) + BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg) .addReg(OffsetReg) .addReg(RegSaveReg); } // Compute the offset for the next argument Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) + BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. - BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) + BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr)) .add(Base) .add(Scale) .add(Index) @@ -36543,7 +36543,7 @@ .setMemRefs(StoreOnlyMMO); // Jump to endMBB - BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) + BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1)) .addMBB(endMBB); } @@ -36553,7 +36553,7 @@ // Load the overflow_area address into a register. Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); - BuildMI(overflowMBB, DL, + BuildMI(overflowMBB, MIMD, TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), OverflowAddrReg) .add(Base) @@ -36571,20 +36571,20 @@ // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI( - overflowMBB, DL, + overflowMBB, MIMD, TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), TmpReg) .addReg(OverflowAddrReg) .addImm(Alignment.value() - 1); BuildMI( - overflowMBB, DL, + overflowMBB, MIMD, TII->get(Subtarget.isTarget64BitLP64() ? 
X86::AND64ri32 : X86::AND32ri), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Alignment.value() - 1)); } else { - BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) + BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); } @@ -36592,14 +36592,14 @@ // (the overflow address should be kept 8-byte aligned) Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI( - overflowMBB, DL, + overflowMBB, MIMD, TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. - BuildMI(overflowMBB, DL, + BuildMI(overflowMBB, MIMD, TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr)) .add(Base) .add(Scale) @@ -36611,7 +36611,7 @@ // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { - BuildMI(*endMBB, endMBB->begin(), DL, + BuildMI(*endMBB, endMBB->begin(), MIMD, TII->get(X86::PHI), DestReg) .addReg(OffsetDestReg).addMBB(offsetMBB) .addReg(OverflowDestReg).addMBB(overflowMBB); @@ -36688,7 +36688,7 @@ MachineBasicBlock *SinkMBB) { MachineFunction *MF = TrueMBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const DebugLoc &DL = MIItBegin->getDebugLoc(); + const MIMetadata MIMD(*MIItBegin); X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); @@ -36721,11 +36721,12 @@ if (RegRewriteTable.contains(Op2Reg)) Op2Reg = RegRewriteTable[Op2Reg].second; - MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) - .addReg(Op1Reg) - .addMBB(FalseMBB) - .addReg(Op2Reg) - .addMBB(TrueMBB); + MIB = + BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg) + .addReg(Op1Reg) + .addMBB(FalseMBB) + .addReg(Op2Reg) + .addMBB(TrueMBB); // Add this PHI to the rewrite table. RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); @@ -36740,7 +36741,7 @@ MachineInstr &SecondCascadedCMOV, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = FirstCMOV.getDebugLoc(); + const MIMetadata MIMD(FirstCMOV); // We lower cascaded CMOVs such as // @@ -36857,11 +36858,13 @@ // Create the conditional branch instructions. 
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); - BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC); + BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); - BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC); + BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1)) + .addMBB(SinkMBB) + .addImm(SecondCC); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] @@ -36869,7 +36872,7 @@ Register Op1Reg = FirstCMOV.getOperand(1).getReg(); Register Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = - BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) + BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(SecondInsertedMBB) .addReg(Op2Reg) @@ -36890,7 +36893,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the @@ -37008,7 +37011,7 @@ FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. - BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); + BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] @@ -37037,7 +37040,7 @@ MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); const unsigned ProbeSize = getStackProbeSize(*MF); @@ -37061,23 +37064,23 @@ Register FinalStackPtr = MRI.createVirtualRegister( TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); - BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr) + BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr) .addReg(physSPReg); { const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr; - BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr) + BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr) .addReg(TmpStackPtr) .addReg(sizeVReg); } // test rsp size - BuildMI(testMBB, DL, + BuildMI(testMBB, MIMD, TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) .addReg(FinalStackPtr) .addReg(physSPReg); - BuildMI(testMBB, DL, TII->get(X86::JCC_1)) + BuildMI(testMBB, MIMD, TII->get(X86::JCC_1)) .addMBB(tailMBB) .addImm(X86::COND_GE); testMBB->addSuccessor(blockMBB); @@ -37097,20 +37100,20 @@ const unsigned XORMIOpc = TFI.Uses64BitFramePtr ? 
X86::XOR64mi32 : X86::XOR32mi; - addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0) + addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0) .addImm(0); - BuildMI(blockMBB, DL, - TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)), physSPReg) + BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)), + physSPReg) .addReg(physSPReg) .addImm(ProbeSize); - - BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB); + BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB); blockMBB->addSuccessor(testMBB); // Replace original instruction by the expected stack ptr - BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) + BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY), + MI.getOperand(0).getReg()) .addReg(FinalStackPtr); tailMBB->splice(tailMBB->end(), MBB, @@ -37130,7 +37133,7 @@ MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); @@ -37185,58 +37188,58 @@ // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. - BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); + BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); - BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) + BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); - BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); + BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. - BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) + BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); - BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) + BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); - BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); + BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. 
const uint32_t *RegMask = Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { - BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) + BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); - BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { - BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) + BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); - BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { - BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) + BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); - BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); - BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) + BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg); + BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) - BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) + BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); - BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) + BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); - BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); + BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); @@ -37245,7 +37248,7 @@ bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. - BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), + BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI), MI.getOperand(0).getReg()) .addReg(mallocPtrVReg) .addMBB(mallocMBB) @@ -37265,7 +37268,7 @@ MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && @@ -37290,7 +37293,7 @@ RestoreMBB->setIsEHPad(true); auto RestoreMBBI = RestoreMBB->begin(); - BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); + BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } @@ -37303,13 +37306,13 @@ // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. 
unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = - BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); + BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. @@ -37317,7 +37320,7 @@ // original instruction around. unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); MachineInstrBuilder CallseqEnd = - BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); + BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0); BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); return BB; @@ -37332,7 +37335,7 @@ // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); @@ -37346,38 +37349,38 @@ Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget.is64Bit()) { MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); - MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); + MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (!isPositionIndependent()) { MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); - MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); - MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } @@ -37471,7 +37474,7 @@ MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); const X86InstrInfo *TII = Subtarget.getInstrInfo(); Register CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode()); @@ -37510,7 +37513,7 @@ const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg) .addReg(CalleeVReg); MI.getOperand(0).ChangeToES(Symbol); MI.setDesc(TII->get(Opc)); @@ -37533,7 +37536,7 @@ /// \param [in] MBB The Machine Basic Block that will be modified. 
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -37548,7 +37551,7 @@ const TargetRegisterClass *PtrRC = getRegClassFor(PVT); Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; - BuildMI(*MBB, MI, DL, TII->get(XorRROpc)) + BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc)) .addDef(ZReg) .addReg(ZReg, RegState::Undef) .addReg(ZReg, RegState::Undef); @@ -37556,11 +37559,11 @@ // Read the current SSP Register value to the zeroed register. Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; - BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); + BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Write the SSP register value to offset 3 in input memory buffer. unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; - MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc)); + MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc)); const int64_t SSPOffset = 3 * PVT.getStoreSize(); const unsigned MemOpndSlot = 1; for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } @@ -37576,7 +37579,7 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -37652,7 +37655,7 @@ const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget.is64Bit()) { - MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); - MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) @@ -37670,7 +37673,7 @@ } else PtrStoreOpc = (PVT == MVT::i64) ? 
X86::MOV64mi32 : X86::MOV32mi; // Store IP - MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); @@ -37688,7 +37691,7 @@ } // Setup - MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -37698,14 +37701,15 @@ // mainMBB: // EAX = 0 - BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); + BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(X86::PHI), DstReg) - .addReg(mainDstReg).addMBB(mainMBB) - .addReg(restoreDstReg).addMBB(restoreMBB); + BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg) + .addReg(mainDstReg) + .addMBB(mainMBB) + .addReg(restoreDstReg) + .addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { @@ -37716,12 +37720,12 @@ Register FramePtr = RegInfo->getFrameRegister(*MF); Register BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; - addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), + addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } - BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); - BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); + BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); + BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI.eraseFromParent(); @@ -37736,7 +37740,7 @@ MachineBasicBlock * X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -37797,11 +37801,11 @@ // Initialize a register with zero. Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass); - BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg); + BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg); if (PVT == MVT::i64) { Register TmpZReg = MRI.createVirtualRegister(PtrRC); - BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg) + BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg) .addImm(0) .addReg(ZReg) .addImm(X86::sub_32bit); @@ -37811,15 +37815,17 @@ // Read the current SSP Register value to the zeroed register. Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; - BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); + BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Check whether the result of the SSP register is zero and jump directly // to the sink. unsigned TestRROpc = (PVT == MVT::i64) ? 
X86::TEST64rr : X86::TEST32rr; - BuildMI(checkSspMBB, DL, TII->get(TestRROpc)) + BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc)) .addReg(SSPCopyReg) .addReg(SSPCopyReg); - BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); + BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1)) + .addMBB(sinkMBB) + .addImm(X86::COND_E); checkSspMBB->addSuccessor(sinkMBB); checkSspMBB->addSuccessor(fallMBB); @@ -37828,7 +37834,7 @@ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; const int64_t SPPOffset = 3 * PVT.getStoreSize(); MachineInstrBuilder MIB = - BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg); + BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) @@ -37844,12 +37850,14 @@ // Subtract the current SSP from the previous SSP. Register SspSubReg = MRI.createVirtualRegister(PtrRC); unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr; - BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg) + BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg) .addReg(PrevSSPReg) .addReg(SSPCopyReg); // Jump to sink in case PrevSSPReg <= SSPCopyReg. - BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE); + BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1)) + .addMBB(sinkMBB) + .addImm(X86::COND_BE); fallMBB->addSuccessor(sinkMBB); fallMBB->addSuccessor(fixShadowMBB); @@ -37857,36 +37865,38 @@ unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; unsigned Offset = (PVT == MVT::i64) ? 3 : 2; Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC); - BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg) + BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg) .addReg(SspSubReg) .addImm(Offset); // Increase SSP when looking only on the lower 8 bits of the delta. unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD; - BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg); + BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg); // Reset the lower 8 bits. Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC); - BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg) + BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg) .addReg(SspFirstShrReg) .addImm(8); // Jump if the result of the shift is zero. - BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); + BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1)) + .addMBB(sinkMBB) + .addImm(X86::COND_E); fixShadowMBB->addSuccessor(sinkMBB); fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); // Do a single shift left. unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri; Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC); - BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg) + BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg) .addReg(SspSecondShrReg) .addImm(1); // Save the value 128 to a register (will be used next with incssp). Register Value128InReg = MRI.createVirtualRegister(PtrRC); unsigned MovRIOpc = (PVT == MVT::i64) ? 
X86::MOV64ri32 : X86::MOV32ri; - BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg) + BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg) .addImm(128); fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB); @@ -37894,21 +37904,23 @@ // iterations of incssp until we finish fixing the shadow stack. Register DecReg = MRI.createVirtualRegister(PtrRC); Register CounterReg = MRI.createVirtualRegister(PtrRC); - BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg) + BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg) .addReg(SspAfterShlReg) .addMBB(fixShadowLoopPrepareMBB) .addReg(DecReg) .addMBB(fixShadowLoopMBB); // Every iteration we increase the SSP by 128. - BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg); + BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg); // Every iteration we decrement the counter by 1. unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r; - BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg); + BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg); // Jump if the counter is not zero yet. - BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE); + BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1)) + .addMBB(fixShadowLoopMBB) + .addImm(X86::COND_NE); fixShadowLoopMBB->addSuccessor(sinkMBB); fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); @@ -37918,7 +37930,7 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -37955,7 +37967,7 @@ } // Reload FP - MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP); + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (MO.isReg()) // Don't add the whole operand, we don't want to @@ -37967,7 +37979,7 @@ MIB.setMemRefs(MMOs); // Reload IP - MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp); + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) @@ -37981,7 +37993,7 @@ MIB.setMemRefs(MMOs); // Reload SP - MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP); + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); @@ -37992,7 +38004,7 @@ MIB.setMemRefs(MMOs); // Jump - BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); + BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp); MI.eraseFromParent(); return thisMBB; @@ -38002,7 +38014,7 @@ MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); @@ -38025,14 +38037,14 @@ Op = (PVT == MVT::i64) ? 
X86::MOV64mr : X86::MOV32mr; if (Subtarget.is64Bit()) - BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) + BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) .addImm(1) .addReg(0) .addMBB(DispatchBB) .addReg(0); else - BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) + BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR) .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) @@ -38040,7 +38052,7 @@ .addReg(0); } - MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op)); addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36); if (UseImmLabel) MIB.addMBB(DispatchBB); @@ -38051,7 +38063,7 @@ MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); MachineFunction *MF = BB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); @@ -38106,7 +38118,7 @@ DispatchBB->setIsEHPad(true); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); - BuildMI(TrapBB, DL, TII->get(X86::TRAP)); + BuildMI(TrapBB, MIMD, TII->get(X86::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); @@ -38138,36 +38150,38 @@ Register FP = RI.getFrameRegister(*MF); Register BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; - addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, + addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) .addRegMask(RI.getNoPreservedMask()); } else { - BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) + BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP)) .addRegMask(RI.getNoPreservedMask()); } // IReg is used as an index in a memory operand and therefore can't be SP Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); - addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, + addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 
8 : 4); - BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) + BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); - BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); + BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1)) + .addMBB(TrapBB) + .addImm(X86::COND_AE); if (Subtarget.is64Bit()) { Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass); Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg - BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) + BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg) .addReg(X86::RIP) .addImm(1) .addReg(0) .addJumpTableIndex(MJTI) .addReg(0); // movzx IReg64, IReg - BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) + BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) .addImm(0) .addReg(IReg) .addImm(X86::sub_32bit); @@ -38175,7 +38189,7 @@ switch (JTE) { case MachineJumpTableInfo::EK_BlockAddress: // jmpq *(BReg,IReg64,8) - BuildMI(DispContBB, DL, TII->get(X86::JMP64m)) + BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m)) .addReg(BReg) .addImm(8) .addReg(IReg64) @@ -38188,20 +38202,21 @@ Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg - BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) + BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg) .addReg(BReg) .addImm(4) .addReg(IReg64) .addImm(0) .addReg(0); // movsx OReg64, OReg - BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg); + BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64) + .addReg(OReg); // addq BReg, OReg64, TReg - BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg) + BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg) .addReg(OReg64) .addReg(BReg); // jmpq *TReg - BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg); + BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg); break; } default: @@ -38209,7 +38224,7 @@ } } else { // jmpl *.LJTI0_0(,IReg,4) - BuildMI(DispContBB, DL, TII->get(X86::JMP32m)) + BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m)) .addReg(0) .addImm(4) .addReg(IReg) @@ -38281,7 +38296,7 @@ MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); auto TMMImmToTMMReg = [](unsigned Imm) { assert (Imm < 8 && "Illegal tmm index"); @@ -38345,46 +38360,46 @@ // precision when performing the addition. int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); // Load the old value of the control word... Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended // precision. Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) + BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill) .addImm(0x300); // Extract to 16 bits. 
Register NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)), NewCWFrameIdx) .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Do the addition. if (MI.getOpcode() == X86::FP80_ADDr) { - BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80)) + BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80)) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .add(MI.getOperand(2)); } else { - BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32)) + BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32)) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .add(MI.getOperand(2)) @@ -38395,7 +38410,7 @@ } // Reload the original control word now. - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. @@ -38415,34 +38430,34 @@ // mode when truncating to an integer value. int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); - addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)), + OrigCWFrameIdx); // Load the old value of the control word... Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) + BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill).addImm(0xC00); // Extract to 16 bits. Register NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)), NewCWFrameIdx) .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... - addFrameReference(BuildMI(*BB, MI, DL, + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Get the X86 opcode to use. @@ -38461,12 +38476,12 @@ } X86AddressMode AM = getAddressFromInstr(&MI, 0); - addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) + addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM) .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. 
- addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FLDCW16m)), OrigCWFrameIdx); + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), + OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -38551,7 +38566,7 @@ } MachineBasicBlock::iterator MBBI(RMBBI); addFullAddress( - BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); + BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM); setDirectAddressInInstr(&MI, 0, computedAddrVReg); @@ -38567,21 +38582,21 @@ // Save RBX into a virtual register. Register SaveRBX = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX) .addReg(X86::RBX); Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst); + BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst); for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) MIB.add(MI.getOperand(Idx)); MIB.add(MI.getOperand(X86::AddrNumOperands)); MIB.addReg(SaveRBX); } else { // Simple case, just copy the virtual register to RBX. - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX) .add(MI.getOperand(X86::AddrNumOperands)); MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B)); + BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B)); for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) MIB.add(MI.getOperand(Idx)); } @@ -38595,32 +38610,32 @@ // If no need to save the base pointer, we generate MWAITXrrr, // else we generate pseudo MWAITX_SAVE_RBX. if (!IsRBX || !TRI->hasBasePointer(*MF)) { - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI.getOperand(0).getReg()); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI.getOperand(1).getReg()); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX) .addReg(MI.getOperand(2).getReg()); - BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr)); + BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr)); MI.eraseFromParent(); } else { if (!BB->isLiveIn(BasePtr)) { BB->addLiveIn(BasePtr); } // Parameters can be copied into ECX and EAX but not EBX yet. - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI.getOperand(0).getReg()); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI.getOperand(1).getReg()); assert(Subtarget.is64Bit() && "Expected 64-bit mode!"); // Save RBX into a virtual register. Register SaveRBX = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) + BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX) .addReg(X86::RBX); // Generate mwaitx pseudo. Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); - BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX)) + BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX)) .addDef(Dst) // Destination tied in with SaveRBX. .addReg(MI.getOperand(2).getReg()) // input value of EBX. 
.addUse(SaveRBX); // Save of base pointer. @@ -38637,7 +38652,7 @@ assert(StackAdjustment != 0 && "0 stack adjustment"); LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment " << StackAdjustment << "\n"); - BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP) + BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP) .addReg(X86::ESP) .addImm(StackAdjustment); MI.eraseFromParent(); @@ -38652,9 +38667,9 @@ LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx << ", arg offset " << ArgOffset << "\n"); // stack pointer + offset - addRegOffset( - BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()), - X86::ESP, false, ArgOffset); + addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r), + MI.getOperand(0).getReg()), + X86::ESP, false, ArgOffset); MI.eraseFromParent(); return BB; } @@ -38675,7 +38690,7 @@ case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; } - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); @@ -38686,7 +38701,7 @@ } case X86::PTILEZERO: { unsigned Imm = MI.getOperand(0).getImm(); - BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); + BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } @@ -38701,7 +38716,7 @@ case X86::PTILESTORED: Opc = X86::TILESTORED; break; } - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; if (Opc != X86::TILESTORED) MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), @@ -38722,14 +38737,14 @@ } case X86::PTCMMIMFP16PS: case X86::PTCMMRLFP16PS: { - const DebugLoc &DL = MI.getDebugLoc(); + const MIMetadata MIMD(MI); unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instruction!"); case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; } - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); @@ -59577,12 +59592,12 @@ Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) + BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. 
for (auto *Exit : Exits) - BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } @@ -59651,7 +59666,7 @@ break; } - return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(X86::KCFI_CHECK)) + return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK)) .addReg(TargetReg) .addImm(MBBI->getCFIType()) .getInstr(); diff --git a/llvm/test/CodeGen/X86/pcsections-atomics.ll b/llvm/test/CodeGen/X86/pcsections-atomics.ll --- a/llvm/test/CodeGen/X86/pcsections-atomics.ll +++ b/llvm/test/CodeGen/X86/pcsections-atomics.ll @@ -5,10 +5,10 @@ ; access, and end with another non-atomic access; this is to test that the ; !pcsections propagation doesn't accidentally touch adjacent instructions. ; -; RUN: llc -O0 < %s | FileCheck %s --check-prefixes=O0 -; RUN: llc -O1 < %s | FileCheck %s --check-prefixes=O1 -; RUN: llc -O2 < %s | FileCheck %s --check-prefixes=O2 -; RUN: llc -O3 < %s | FileCheck %s --check-prefixes=O3 +; RUN: llc -O0 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O0 +; RUN: llc -O1 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O1 +; RUN: llc -O2 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O2 +; RUN: llc -O3 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O3 target triple = "x86_64-unknown-linux-gnu" @@ -9979,4 +9979,6320 @@ ret i64 2 } +define i128 @atomic128_load_unordered(ptr %a) { +; O0-LABEL: atomic128_load_unordered: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection414: +; O0-NEXT: xorl %eax, %eax +; O0-NEXT: movl %eax, %ebx +; O0-NEXT: movq %rbx, %rax +; O0-NEXT: movq %rbx, %rdx +; O0-NEXT: movq %rbx, %rcx +; O0-NEXT: .Lpcsection415: +; O0-NEXT: lock cmpxchg16b (%rdi) +; O0-NEXT: movq $1, foo(%rip) +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_load_unordered: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection328: +; O1-NEXT: xorl %eax, %eax +; O1-NEXT: .Lpcsection329: +; O1-NEXT: xorl %edx, %edx +; O1-NEXT: .Lpcsection330: +; O1-NEXT: xorl %ecx, %ecx +; O1-NEXT: .Lpcsection331: +; O1-NEXT: xorl %ebx, %ebx +; O1-NEXT: .Lpcsection332: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_load_unordered: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection328: +; O2-NEXT: xorl %eax, %eax +; O2-NEXT: .Lpcsection329: +; O2-NEXT: xorl %edx, %edx +; O2-NEXT: .Lpcsection330: +; O2-NEXT: xorl %ecx, %ecx +; O2-NEXT: .Lpcsection331: +; O2-NEXT: xorl %ebx, %ebx +; O2-NEXT: .Lpcsection332: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_load_unordered: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection328: +; O3-NEXT: xorl %eax, %eax +; O3-NEXT: .Lpcsection329: +; O3-NEXT: xorl %edx, %edx +; O3-NEXT: .Lpcsection330: +; O3-NEXT: xorl %ecx, %ecx +; O3-NEXT: .Lpcsection331: +; O3-NEXT: xorl %ebx, %ebx +; O3-NEXT: 
.Lpcsection332:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = load atomic i128, ptr %a unordered, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret i128 %x
+}
+
+define i128 @atomic128_load_monotonic(ptr %a) {
+; O0-LABEL: atomic128_load_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection416:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ebx
+; O0-NEXT: movq %rbx, %rax
+; O0-NEXT: movq %rbx, %rdx
+; O0-NEXT: movq %rbx, %rcx
+; O0-NEXT: .Lpcsection417:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $1, foo(%rip)
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_load_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection333:
+; O1-NEXT: xorl %eax, %eax
+; O1-NEXT: .Lpcsection334:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection335:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection336:
+; O1-NEXT: xorl %ebx, %ebx
+; O1-NEXT: .Lpcsection337:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_load_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection333:
+; O2-NEXT: xorl %eax, %eax
+; O2-NEXT: .Lpcsection334:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection335:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection336:
+; O2-NEXT: xorl %ebx, %ebx
+; O2-NEXT: .Lpcsection337:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_load_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection333:
+; O3-NEXT: xorl %eax, %eax
+; O3-NEXT: .Lpcsection334:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection335:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection336:
+; O3-NEXT: xorl %ebx, %ebx
+; O3-NEXT: .Lpcsection337:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = load atomic i128, ptr %a monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret i128 %x
+}
+
+define i128 @atomic128_load_acquire(ptr %a) {
+; O0-LABEL: atomic128_load_acquire:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection418:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ebx
+; O0-NEXT: movq %rbx, %rax
+; O0-NEXT: movq %rbx, %rdx
+; O0-NEXT: movq %rbx, %rcx
+; O0-NEXT: .Lpcsection419:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $1, foo(%rip)
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_load_acquire:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection338:
+; O1-NEXT: xorl %eax, %eax
+; O1-NEXT: .Lpcsection339:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection340:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection341:
+; O1-NEXT: xorl %ebx, %ebx
+; O1-NEXT: .Lpcsection342:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_load_acquire:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection338:
+; O2-NEXT: xorl %eax, %eax
+; O2-NEXT: .Lpcsection339:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection340:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection341:
+; O2-NEXT: xorl %ebx, %ebx
+; O2-NEXT: .Lpcsection342:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_load_acquire:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection338:
+; O3-NEXT: xorl %eax, %eax
+; O3-NEXT: .Lpcsection339:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection340:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection341:
+; O3-NEXT: xorl %ebx, %ebx
+; O3-NEXT: .Lpcsection342:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = load atomic i128, ptr %a acquire, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret i128 %x
+}
+
+define i128 @atomic128_load_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_load_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection420:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ebx
+; O0-NEXT: movq %rbx, %rax
+; O0-NEXT: movq %rbx, %rdx
+; O0-NEXT: movq %rbx, %rcx
+; O0-NEXT: .Lpcsection421:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $1, foo(%rip)
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_load_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection343:
+; O1-NEXT: xorl %eax, %eax
+; O1-NEXT: .Lpcsection344:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection345:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection346:
+; O1-NEXT: xorl %ebx, %ebx
+; O1-NEXT: .Lpcsection347:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_load_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection343:
+; O2-NEXT: xorl %eax, %eax
+; O2-NEXT: .Lpcsection344:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection345:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection346:
+; O2-NEXT: xorl %ebx, %ebx
+; O2-NEXT: .Lpcsection347:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_load_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection343:
+; O3-NEXT: xorl %eax, %eax
+; O3-NEXT: .Lpcsection344:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection345:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection346:
+; O3-NEXT: xorl %ebx, %ebx
+; O3-NEXT: .Lpcsection347:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = load atomic i128, ptr %a seq_cst, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret i128 %x
+}
+
+define ptr @atomic128_load_seq_cst_ptr_ty(ptr %a) {
+; O0-LABEL: atomic128_load_seq_cst_ptr_ty:
+; O0: # %bb.0: # %entry
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection422:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: movq $1, foo
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_load_seq_cst_ptr_ty:
+; O1: # %bb.0: # %entry
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection348:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_load_seq_cst_ptr_ty:
+; O2: # %bb.0: # %entry
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection348:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_load_seq_cst_ptr_ty:
+; O3: # %bb.0: # %entry
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection348:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = load atomic ptr, ptr %a seq_cst, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret ptr %x
+}
+
+define void @atomic128_store_unordered(ptr %a) {
+; O0-LABEL: atomic128_store_unordered:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection423:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection424:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection425:
+; O0-NEXT: jmp .LBB203_1
+; O0-NEXT: .LBB203_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: .Lpcsection426:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection427:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection428:
+; O0-NEXT: movl $42, %ebx
+; O0-NEXT: .Lpcsection429:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection430:
+; O0-NEXT: jne .LBB203_1
+; O0-NEXT: jmp .LBB203_2
+; O0-NEXT: .LBB203_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_store_unordered:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection349:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection350:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection351:
+; O1-NEXT: movl $42, %ebx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB203_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: .Lpcsection352:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection353:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection354:
+; O1-NEXT: jne .LBB203_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_store_unordered:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection349:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection350:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection351:
+; O2-NEXT: movl $42, %ebx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB203_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: .Lpcsection352:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection353:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection354:
+; O2-NEXT: jne .LBB203_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_store_unordered:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection349:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection350:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection351:
+; O3-NEXT: movl $42, %ebx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB203_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: .Lpcsection352:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection353:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection354:
+; O3-NEXT: jne .LBB203_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ store atomic i128 42, ptr %a unordered, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_store_monotonic(ptr %a) {
+; O0-LABEL: atomic128_store_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection431:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection432:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection433:
+; O0-NEXT: jmp .LBB204_1
+; O0-NEXT: .LBB204_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: .Lpcsection434:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection435:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection436:
+; O0-NEXT: movl $42, %ebx
+; O0-NEXT: .Lpcsection437:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection438:
+; O0-NEXT: jne .LBB204_1
+; O0-NEXT: jmp .LBB204_2
+; O0-NEXT: .LBB204_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_store_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection355:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection356:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection357:
+; O1-NEXT: movl $42, %ebx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB204_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: .Lpcsection358:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection359:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection360:
+; O1-NEXT: jne .LBB204_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_store_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection355:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection356:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection357:
+; O2-NEXT: movl $42, %ebx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB204_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: .Lpcsection358:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection359:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection360:
+; O2-NEXT: jne .LBB204_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_store_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection355:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection356:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection357:
+; O3-NEXT: movl $42, %ebx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB204_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: .Lpcsection358:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection359:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection360:
+; O3-NEXT: jne .LBB204_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ store atomic i128 42, ptr %a monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_store_release(ptr %a) {
+; O0-LABEL: atomic128_store_release:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection439:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection440:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection441:
+; O0-NEXT: jmp .LBB205_1
+; O0-NEXT: .LBB205_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: .Lpcsection442:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection443:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection444:
+; O0-NEXT: movl $42, %ebx
+; O0-NEXT: .Lpcsection445:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection446:
+; O0-NEXT: jne .LBB205_1
+; O0-NEXT: jmp .LBB205_2
+; O0-NEXT: .LBB205_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_store_release:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection361:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection362:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection363:
+; O1-NEXT: movl $42, %ebx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB205_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: .Lpcsection364:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection365:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection366:
+; O1-NEXT: jne .LBB205_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_store_release:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection361:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection362:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection363:
+; O2-NEXT: movl $42, %ebx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB205_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: .Lpcsection364:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection365:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection366:
+; O2-NEXT: jne .LBB205_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_store_release:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection361:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection362:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection363:
+; O3-NEXT: movl $42, %ebx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB205_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: .Lpcsection364:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection365:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection366:
+; O3-NEXT: jne .LBB205_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ store atomic i128 42, ptr %a release, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_store_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_store_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection447:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection448:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection449:
+; O0-NEXT: jmp .LBB206_1
+; O0-NEXT: .LBB206_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: .Lpcsection450:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection451:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection452:
+; O0-NEXT: movl $42, %ebx
+; O0-NEXT: .Lpcsection453:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection454:
+; O0-NEXT: jne .LBB206_1
+; O0-NEXT: jmp .LBB206_2
+; O0-NEXT: .LBB206_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_store_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection367:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection368:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection369:
+; O1-NEXT: movl $42, %ebx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB206_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: .Lpcsection370:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection371:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection372:
+; O1-NEXT: jne .LBB206_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_store_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection367:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection368:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection369:
+; O2-NEXT: movl $42, %ebx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB206_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: .Lpcsection370:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection371:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection372:
+; O2-NEXT: jne .LBB206_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_store_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection367:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection368:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection369:
+; O3-NEXT: movl $42, %ebx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB206_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: .Lpcsection370:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection371:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection372:
+; O3-NEXT: jne .LBB206_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ store atomic i128 42, ptr %a seq_cst, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_store_seq_cst_ptr_ty(ptr %a, ptr %v) {
+; O0-LABEL: atomic128_store_seq_cst_ptr_ty:
+; O0: # %bb.0: # %entry
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection455:
+; O0-NEXT: xchgq %rsi, (%rdi)
+; O0-NEXT: movq $1, foo
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_store_seq_cst_ptr_ty:
+; O1: # %bb.0: # %entry
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection373:
+; O1-NEXT: xchgq %rsi, (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_store_seq_cst_ptr_ty:
+; O2: # %bb.0: # %entry
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection373:
+; O2-NEXT: xchgq %rsi, (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_store_seq_cst_ptr_ty:
+; O3: # %bb.0: # %entry
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection373:
+; O3-NEXT: xchgq %rsi, (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ store atomic ptr %v, ptr %a seq_cst, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_xchg_monotonic(ptr %a) {
+; O0-LABEL: atomic128_xchg_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection456:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection457:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection458:
+; O0-NEXT: jmp .LBB208_1
+; O0-NEXT: .LBB208_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: .Lpcsection459:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection460:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection461:
+; O0-NEXT: movl $42, %ebx
+; O0-NEXT: .Lpcsection462:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection463:
+; O0-NEXT: jne .LBB208_1
+; O0-NEXT: jmp .LBB208_2
+; O0-NEXT: .LBB208_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_xchg_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection374:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection375:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection376:
+; O1-NEXT: movl $42, %ebx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB208_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: .Lpcsection377:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection378:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection379:
+; O1-NEXT: jne .LBB208_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_xchg_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection374:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection375:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection376:
+; O2-NEXT: movl $42, %ebx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB208_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: .Lpcsection377:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection378:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection379:
+; O2-NEXT: jne .LBB208_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_xchg_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection374:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection375:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection376:
+; O3-NEXT: movl $42, %ebx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB208_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: .Lpcsection377:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection378:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection379:
+; O3-NEXT: jne .LBB208_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw xchg ptr %a, i128 42 monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_add_monotonic(ptr %a) {
+; O0-LABEL: atomic128_add_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection464:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection465:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection466:
+; O0-NEXT: jmp .LBB209_1
+; O0-NEXT: .LBB209_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection467:
+; O0-NEXT: addq $42, %rbx
+; O0-NEXT: movq %rdx, %rcx
+; O0-NEXT: .Lpcsection468:
+; O0-NEXT: adcq $0, %rcx
+; O0-NEXT: .Lpcsection469:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection470:
+; O0-NEXT: jne .LBB209_1
+; O0-NEXT: jmp .LBB209_2
+; O0-NEXT: .LBB209_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_add_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection380:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection381:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB209_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection382:
+; O1-NEXT: addq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection383:
+; O1-NEXT: adcq $0, %rcx
+; O1-NEXT: .Lpcsection384:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection385:
+; O1-NEXT: jne .LBB209_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_add_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection380:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection381:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB209_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection382:
+; O2-NEXT: addq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection383:
+; O2-NEXT: adcq $0, %rcx
+; O2-NEXT: .Lpcsection384:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection385:
+; O2-NEXT: jne .LBB209_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_add_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection380:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection381:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB209_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection382:
+; O3-NEXT: addq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection383:
+; O3-NEXT: adcq $0, %rcx
+; O3-NEXT: .Lpcsection384:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection385:
+; O3-NEXT: jne .LBB209_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw add ptr %a, i128 42 monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_sub_monotonic(ptr %a) {
+; O0-LABEL: atomic128_sub_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection471:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection472:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection473:
+; O0-NEXT: jmp .LBB210_1
+; O0-NEXT: .LBB210_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection474:
+; O0-NEXT: addq $-42, %rbx
+; O0-NEXT: movq %rdx, %rcx
+; O0-NEXT: .Lpcsection475:
+; O0-NEXT: adcq $-1, %rcx
+; O0-NEXT: .Lpcsection476:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection477:
+; O0-NEXT: jne .LBB210_1
+; O0-NEXT: jmp .LBB210_2
+; O0-NEXT: .LBB210_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_sub_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection386:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection387:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB210_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection388:
+; O1-NEXT: addq $-42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection389:
+; O1-NEXT: adcq $-1, %rcx
+; O1-NEXT: .Lpcsection390:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection391:
+; O1-NEXT: jne .LBB210_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_sub_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection386:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection387:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB210_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection388:
+; O2-NEXT: addq $-42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection389:
+; O2-NEXT: adcq $-1, %rcx
+; O2-NEXT: .Lpcsection390:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection391:
+; O2-NEXT: jne .LBB210_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_sub_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection386:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection387:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB210_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection388:
+; O3-NEXT: addq $-42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection389:
+; O3-NEXT: adcq $-1, %rcx
+; O3-NEXT: .Lpcsection390:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection391:
+; O3-NEXT: jne .LBB210_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw sub ptr %a, i128 42 monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_and_monotonic(ptr %a) {
+; O0-LABEL: atomic128_and_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection478:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection479:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection480:
+; O0-NEXT: jmp .LBB211_1
+; O0-NEXT: .LBB211_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: .Lpcsection481:
+; O0-NEXT: andl $42, %ecx
+; O0-NEXT: movl %ecx, %ebx
+; O0-NEXT: .Lpcsection482:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection483:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection484:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection485:
+; O0-NEXT: jne .LBB211_1
+; O0-NEXT: jmp .LBB211_2
+; O0-NEXT: .LBB211_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_and_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection392:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection393:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB211_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movl %eax, %ebx
+; O1-NEXT: .Lpcsection394:
+; O1-NEXT: andl $42, %ebx
+; O1-NEXT: .Lpcsection395:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection396:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection397:
+; O1-NEXT: jne .LBB211_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_and_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection392:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection393:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB211_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movl %eax, %ebx
+; O2-NEXT: .Lpcsection394:
+; O2-NEXT: andl $42, %ebx
+; O2-NEXT: .Lpcsection395:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection396:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection397:
+; O2-NEXT: jne .LBB211_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_and_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection392:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection393:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB211_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movl %eax, %ebx
+; O3-NEXT: .Lpcsection394:
+; O3-NEXT: andl $42, %ebx
+; O3-NEXT: .Lpcsection395:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection396:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection397:
+; O3-NEXT: jne .LBB211_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw and ptr %a, i128 42 monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_or_monotonic(ptr %a) {
+; O0-LABEL: atomic128_or_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection486:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection487:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection488:
+; O0-NEXT: jmp .LBB212_1
+; O0-NEXT: .LBB212_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection489:
+; O0-NEXT: orq $42, %rbx
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection490:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection491:
+; O0-NEXT: jne .LBB212_1
+; O0-NEXT: jmp .LBB212_2
+; O0-NEXT: .LBB212_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_or_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection398:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection399:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB212_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection400:
+; O1-NEXT: orq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection401:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection402:
+; O1-NEXT: jne .LBB212_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_or_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection398:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection399:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB212_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection400:
+; O2-NEXT: orq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection401:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection402:
+; O2-NEXT: jne .LBB212_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_or_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection398:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection399:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB212_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection400:
+; O3-NEXT: orq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection401:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection402:
+; O3-NEXT: jne .LBB212_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw or ptr %a, i128 42 monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_xor_monotonic(ptr %a) {
+; O0-LABEL: atomic128_xor_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection492:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection493:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection494:
+; O0-NEXT: jmp .LBB213_1
+; O0-NEXT: .LBB213_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection495:
+; O0-NEXT: xorq $42, %rbx
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection496:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection497:
+; O0-NEXT: jne .LBB213_1
+; O0-NEXT: jmp .LBB213_2
+; O0-NEXT: .LBB213_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_xor_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection403:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection404:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB213_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection405:
+; O1-NEXT: xorq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection406:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection407:
+; O1-NEXT: jne .LBB213_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_xor_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection403:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection404:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB213_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection405:
+; O2-NEXT: xorq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection406:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection407:
+; O2-NEXT: jne .LBB213_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_xor_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection403:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection404:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB213_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection405:
+; O3-NEXT: xorq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection406:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection407:
+; O3-NEXT: jne .LBB213_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw xor ptr %a, i128 42 monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_nand_monotonic(ptr %a) {
+; O0-LABEL: atomic128_nand_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection498:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection499:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection500:
+; O0-NEXT: jmp .LBB214_1
+; O0-NEXT: .LBB214_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: .Lpcsection501:
+; O0-NEXT: notl %ecx
+; O0-NEXT: .Lpcsection502:
+; O0-NEXT: # implicit-def: $rbx
+; O0-NEXT: movl %ecx, %ebx
+; O0-NEXT: .Lpcsection503:
+; O0-NEXT: orq $-43, %rbx
+; O0-NEXT: .Lpcsection504:
+; O0-NEXT: movq $-1, %rcx
+; O0-NEXT: .Lpcsection505:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection506:
+; O0-NEXT: jne .LBB214_1
+; O0-NEXT: jmp .LBB214_2
+; O0-NEXT: .LBB214_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_nand_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection408:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection409:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection410:
+; O1-NEXT: movq $-1, %rcx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB214_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movl %eax, %ebx
+; O1-NEXT: .Lpcsection411:
+; O1-NEXT: notl %ebx
+; O1-NEXT: .Lpcsection412:
+; O1-NEXT: orq $-43, %rbx
+; O1-NEXT: .Lpcsection413:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection414:
+; O1-NEXT: jne .LBB214_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_nand_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection408:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection409:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection410:
+; O2-NEXT: movq $-1, %rcx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB214_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movl %eax, %ebx
+; O2-NEXT: .Lpcsection411:
+; O2-NEXT: notl %ebx
+; O2-NEXT: .Lpcsection412:
+; O2-NEXT: orq $-43, %rbx
+; O2-NEXT: .Lpcsection413:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection414:
+; O2-NEXT: jne .LBB214_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_nand_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection408:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection409:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection410:
+; O3-NEXT: movq $-1, %rcx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB214_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movl %eax, %ebx
+; O3-NEXT: .Lpcsection411:
+; O3-NEXT: notl %ebx
+; O3-NEXT: .Lpcsection412:
+; O3-NEXT: orq $-43, %rbx
+; O3-NEXT: .Lpcsection413:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection414:
+; O3-NEXT: jne .LBB214_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw nand ptr %a, i128 42 monotonic, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_xchg_acquire(ptr %a) {
+; O0-LABEL: atomic128_xchg_acquire:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection507:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection508:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection509:
+; O0-NEXT: jmp .LBB215_1
+; O0-NEXT: .LBB215_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: .Lpcsection510:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection511:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection512:
+; O0-NEXT: movl $42, %ebx
+; O0-NEXT: .Lpcsection513:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection514:
+; O0-NEXT: jne .LBB215_1
+; O0-NEXT: jmp .LBB215_2
+; O0-NEXT: .LBB215_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_xchg_acquire:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection415:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection416:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection417:
+; O1-NEXT: movl $42, %ebx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB215_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: .Lpcsection418:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection419:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection420:
+; O1-NEXT: jne .LBB215_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_xchg_acquire:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection415:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection416:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection417:
+; O2-NEXT: movl $42, %ebx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB215_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: .Lpcsection418:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection419:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection420:
+; O2-NEXT: jne .LBB215_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_xchg_acquire:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection415:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection416:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection417:
+; O3-NEXT: movl $42, %ebx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB215_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: .Lpcsection418:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection419:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection420:
+; O3-NEXT: jne .LBB215_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw xchg ptr %a, i128 42 acquire, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_add_acquire(ptr %a) {
+; O0-LABEL: atomic128_add_acquire:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection515:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection516:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection517:
+; O0-NEXT: jmp .LBB216_1
+; O0-NEXT: .LBB216_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection518:
+; O0-NEXT: addq $42, %rbx
+; O0-NEXT: movq %rdx, %rcx
+; O0-NEXT: .Lpcsection519:
+; O0-NEXT: adcq $0, %rcx
+; O0-NEXT: .Lpcsection520:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection521:
+; O0-NEXT: jne .LBB216_1
+; O0-NEXT: jmp .LBB216_2
+; O0-NEXT: .LBB216_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_add_acquire:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection421:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection422:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB216_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection423:
+; O1-NEXT: addq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection424:
+; O1-NEXT: adcq $0, %rcx
+; O1-NEXT: .Lpcsection425:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection426:
+; O1-NEXT: jne .LBB216_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_add_acquire:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection421:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection422:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB216_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection423:
+; O2-NEXT: addq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection424:
+; O2-NEXT: adcq $0, %rcx
+; O2-NEXT: .Lpcsection425:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection426:
+; O2-NEXT: jne .LBB216_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_add_acquire:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection421:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection422:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB216_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection423:
+; O3-NEXT: addq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection424:
+; O3-NEXT: adcq $0, %rcx
+; O3-NEXT: .Lpcsection425:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection426:
+; O3-NEXT: jne .LBB216_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw add ptr %a, i128 42 acquire, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_sub_acquire(ptr %a) {
+; O0-LABEL: atomic128_sub_acquire:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection522:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection523:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection524:
+; O0-NEXT: jmp .LBB217_1
+; O0-NEXT: .LBB217_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection525:
+; O0-NEXT: addq $-42, %rbx
+; O0-NEXT: movq %rdx, %rcx
+; O0-NEXT: .Lpcsection526:
+; O0-NEXT: adcq $-1, %rcx
+; O0-NEXT: .Lpcsection527:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection528:
+; O0-NEXT: jne .LBB217_1
+; O0-NEXT: jmp .LBB217_2
+; O0-NEXT: .LBB217_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_sub_acquire:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection427:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection428:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB217_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection429:
+; O1-NEXT: addq $-42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection430:
+; O1-NEXT: adcq $-1, %rcx
+; O1-NEXT: .Lpcsection431:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection432:
+; O1-NEXT: jne .LBB217_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_sub_acquire:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection427:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection428:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB217_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection429:
+; O2-NEXT: addq $-42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection430:
+; O2-NEXT: adcq $-1, %rcx
+; O2-NEXT: .Lpcsection431:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection432:
+; O2-NEXT: jne .LBB217_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_sub_acquire:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection427:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection428:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB217_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection429:
+; O3-NEXT: addq $-42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection430:
+; O3-NEXT: adcq $-1, %rcx
+; O3-NEXT: .Lpcsection431:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection432:
+; O3-NEXT: jne .LBB217_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw sub ptr %a, i128 42 acquire, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_and_acquire(ptr %a) {
+; O0-LABEL: atomic128_and_acquire:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection529:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection530:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection531:
+; O0-NEXT: jmp .LBB218_1
+; O0-NEXT: .LBB218_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: .Lpcsection532:
+; O0-NEXT: andl $42, %ecx
+; O0-NEXT: movl %ecx, %ebx
+; O0-NEXT: .Lpcsection533:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection534:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection535:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection536:
+; O0-NEXT: jne .LBB218_1
+; O0-NEXT: jmp .LBB218_2
+; O0-NEXT: .LBB218_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_and_acquire:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection433:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection434:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB218_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movl %eax, %ebx
+; O1-NEXT: .Lpcsection435:
+; O1-NEXT: andl $42, %ebx
+; O1-NEXT: .Lpcsection436:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection437:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection438:
+; O1-NEXT: jne .LBB218_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_and_acquire:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection433:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection434:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB218_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movl %eax, %ebx
+; O2-NEXT: .Lpcsection435:
+; O2-NEXT: andl $42, %ebx
+; O2-NEXT: .Lpcsection436:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection437:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection438:
+; O2-NEXT: jne .LBB218_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_and_acquire:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection433:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection434:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB218_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movl %eax, %ebx
+; O3-NEXT: .Lpcsection435:
+; O3-NEXT: andl $42, %ebx
+; O3-NEXT: .Lpcsection436:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection437:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection438:
+; O3-NEXT: jne .LBB218_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+ load volatile i64, ptr @foo, align 8
+ %x = atomicrmw and ptr %a, i128 42 acquire, align 16, !pcsections !0
+ store volatile i64 1, ptr @foo, align 8
+ ret void
+}
+
+define void @atomic128_or_acquire(ptr %a) {
+; O0-LABEL: atomic128_or_acquire:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection537:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection538:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection539:
+; O0-NEXT: jmp .LBB219_1
+; O0-NEXT: .LBB219_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection540:
+; O0-NEXT: orq $42, %rbx
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection541:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection542:
+; O0-NEXT: jne .LBB219_1
+; O0-NEXT: jmp .LBB219_2
+; O0-NEXT: .LBB219_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_or_acquire:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection439:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection440:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB219_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection441:
+; O1-NEXT: orq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection442:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection443:
+; O1-NEXT: jne .LBB219_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_or_acquire:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection439:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection440:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB219_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection441:
+; O2-NEXT: orq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection442:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection443:
+; O2-NEXT: jne .LBB219_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_or_acquire:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT:
.cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection439: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection440: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB219_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq %rax, %rbx +; O3-NEXT: .Lpcsection441: +; O3-NEXT: orq $42, %rbx +; O3-NEXT: movq %rdx, %rcx +; O3-NEXT: .Lpcsection442: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection443: +; O3-NEXT: jne .LBB219_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw or ptr %a, i128 42 acquire, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_xor_acquire(ptr %a) { +; O0-LABEL: atomic128_xor_acquire: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection543: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection544: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection545: +; O0-NEXT: jmp .LBB220_1 +; O0-NEXT: .LBB220_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movq %rax, %rbx +; O0-NEXT: .Lpcsection546: +; O0-NEXT: xorq $42, %rbx +; O0-NEXT: movq %rcx, %rdx +; O0-NEXT: .Lpcsection547: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection548: +; O0-NEXT: jne .LBB220_1 +; O0-NEXT: jmp .LBB220_2 +; O0-NEXT: .LBB220_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_xor_acquire: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection444: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection445: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB220_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movq %rax, %rbx +; O1-NEXT: .Lpcsection446: +; O1-NEXT: xorq $42, %rbx +; O1-NEXT: movq %rdx, %rcx +; O1-NEXT: .Lpcsection447: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection448: +; O1-NEXT: jne .LBB220_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_xor_acquire: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection444: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection445: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB220_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movq %rax, %rbx +; O2-NEXT: .Lpcsection446: +; O2-NEXT: xorq 
$42, %rbx +; O2-NEXT: movq %rdx, %rcx +; O2-NEXT: .Lpcsection447: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection448: +; O2-NEXT: jne .LBB220_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_xor_acquire: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection444: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection445: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB220_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq %rax, %rbx +; O3-NEXT: .Lpcsection446: +; O3-NEXT: xorq $42, %rbx +; O3-NEXT: movq %rdx, %rcx +; O3-NEXT: .Lpcsection447: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection448: +; O3-NEXT: jne .LBB220_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw xor ptr %a, i128 42 acquire, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_nand_acquire(ptr %a) { +; O0-LABEL: atomic128_nand_acquire: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection549: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection550: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection551: +; O0-NEXT: jmp .LBB221_1 +; O0-NEXT: .LBB221_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movl %eax, %ecx +; O0-NEXT: .Lpcsection552: +; O0-NEXT: notl %ecx +; O0-NEXT: .Lpcsection553: +; O0-NEXT: # implicit-def: $rbx +; O0-NEXT: movl %ecx, %ebx +; O0-NEXT: .Lpcsection554: +; O0-NEXT: orq $-43, %rbx +; O0-NEXT: .Lpcsection555: +; O0-NEXT: movq $-1, %rcx +; O0-NEXT: .Lpcsection556: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection557: +; O0-NEXT: jne .LBB221_1 +; O0-NEXT: jmp .LBB221_2 +; O0-NEXT: .LBB221_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_nand_acquire: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection449: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection450: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .Lpcsection451: +; O1-NEXT: movq $-1, %rcx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB221_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movl %eax, %ebx +; O1-NEXT: .Lpcsection452: +; O1-NEXT: notl %ebx +; O1-NEXT: .Lpcsection453: +; O1-NEXT: orq $-43, %rbx +; O1-NEXT: .Lpcsection454: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection455: +; O1-NEXT: jne 
.LBB221_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_nand_acquire: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection449: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection450: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .Lpcsection451: +; O2-NEXT: movq $-1, %rcx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB221_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movl %eax, %ebx +; O2-NEXT: .Lpcsection452: +; O2-NEXT: notl %ebx +; O2-NEXT: .Lpcsection453: +; O2-NEXT: orq $-43, %rbx +; O2-NEXT: .Lpcsection454: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection455: +; O2-NEXT: jne .LBB221_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_nand_acquire: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection449: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection450: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .Lpcsection451: +; O3-NEXT: movq $-1, %rcx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB221_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movl %eax, %ebx +; O3-NEXT: .Lpcsection452: +; O3-NEXT: notl %ebx +; O3-NEXT: .Lpcsection453: +; O3-NEXT: orq $-43, %rbx +; O3-NEXT: .Lpcsection454: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection455: +; O3-NEXT: jne .LBB221_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw nand ptr %a, i128 42 acquire, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_xchg_release(ptr %a) { +; O0-LABEL: atomic128_xchg_release: +; O0: # %bb.0: +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection558: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection559: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection560: +; O0-NEXT: jmp .LBB222_1 +; O0-NEXT: .LBB222_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: .Lpcsection561: +; O0-NEXT: xorl %ecx, %ecx +; O0-NEXT: .Lpcsection562: +; O0-NEXT: # kill: def $rcx killed $ecx +; O0-NEXT: .Lpcsection563: +; O0-NEXT: movl $42, %ebx +; O0-NEXT: .Lpcsection564: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection565: +; O0-NEXT: jne .LBB222_1 +; O0-NEXT: jmp .LBB222_2 +; O0-NEXT: .LBB222_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: 
atomic128_xchg_release: +; O1: # %bb.0: +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection456: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection457: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .Lpcsection458: +; O1-NEXT: movl $42, %ebx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB222_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: .Lpcsection459: +; O1-NEXT: xorl %ecx, %ecx +; O1-NEXT: .Lpcsection460: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection461: +; O1-NEXT: jne .LBB222_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_xchg_release: +; O2: # %bb.0: +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection456: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection457: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .Lpcsection458: +; O2-NEXT: movl $42, %ebx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB222_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: .Lpcsection459: +; O2-NEXT: xorl %ecx, %ecx +; O2-NEXT: .Lpcsection460: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection461: +; O2-NEXT: jne .LBB222_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_xchg_release: +; O3: # %bb.0: +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection456: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection457: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .Lpcsection458: +; O3-NEXT: movl $42, %ebx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB222_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: .Lpcsection459: +; O3-NEXT: xorl %ecx, %ecx +; O3-NEXT: .Lpcsection460: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection461: +; O3-NEXT: jne .LBB222_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq + load volatile i64, ptr @foo, align 8 + %x = atomicrmw xchg ptr %a, i128 42 release, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_add_release(ptr %a) { +; O0-LABEL: atomic128_add_release: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection566: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection567: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection568: +; O0-NEXT: jmp .LBB223_1 +; O0-NEXT: .LBB223_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movq %rax, %rbx +; O0-NEXT: .Lpcsection569: +; O0-NEXT: addq $42, %rbx +; O0-NEXT: movq %rdx, %rcx +; O0-NEXT: .Lpcsection570: +; O0-NEXT: adcq $0, %rcx 
+; O0-NEXT: .Lpcsection571: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection572: +; O0-NEXT: jne .LBB223_1 +; O0-NEXT: jmp .LBB223_2 +; O0-NEXT: .LBB223_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_add_release: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection462: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection463: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB223_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movq %rax, %rbx +; O1-NEXT: .Lpcsection464: +; O1-NEXT: addq $42, %rbx +; O1-NEXT: movq %rdx, %rcx +; O1-NEXT: .Lpcsection465: +; O1-NEXT: adcq $0, %rcx +; O1-NEXT: .Lpcsection466: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection467: +; O1-NEXT: jne .LBB223_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_add_release: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection462: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection463: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB223_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movq %rax, %rbx +; O2-NEXT: .Lpcsection464: +; O2-NEXT: addq $42, %rbx +; O2-NEXT: movq %rdx, %rcx +; O2-NEXT: .Lpcsection465: +; O2-NEXT: adcq $0, %rcx +; O2-NEXT: .Lpcsection466: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection467: +; O2-NEXT: jne .LBB223_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_add_release: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection462: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection463: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB223_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq %rax, %rbx +; O3-NEXT: .Lpcsection464: +; O3-NEXT: addq $42, %rbx +; O3-NEXT: movq %rdx, %rcx +; O3-NEXT: .Lpcsection465: +; O3-NEXT: adcq $0, %rcx +; O3-NEXT: .Lpcsection466: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection467: +; O3-NEXT: jne .LBB223_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw add ptr %a, i128 42 release, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_sub_release(ptr %a) { +; O0-LABEL: atomic128_sub_release: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection573: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection574: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: 
movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection575: +; O0-NEXT: jmp .LBB224_1 +; O0-NEXT: .LBB224_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movq %rax, %rbx +; O0-NEXT: .Lpcsection576: +; O0-NEXT: addq $-42, %rbx +; O0-NEXT: movq %rdx, %rcx +; O0-NEXT: .Lpcsection577: +; O0-NEXT: adcq $-1, %rcx +; O0-NEXT: .Lpcsection578: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection579: +; O0-NEXT: jne .LBB224_1 +; O0-NEXT: jmp .LBB224_2 +; O0-NEXT: .LBB224_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_sub_release: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection468: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection469: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB224_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movq %rax, %rbx +; O1-NEXT: .Lpcsection470: +; O1-NEXT: addq $-42, %rbx +; O1-NEXT: movq %rdx, %rcx +; O1-NEXT: .Lpcsection471: +; O1-NEXT: adcq $-1, %rcx +; O1-NEXT: .Lpcsection472: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection473: +; O1-NEXT: jne .LBB224_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_sub_release: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection468: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection469: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB224_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movq %rax, %rbx +; O2-NEXT: .Lpcsection470: +; O2-NEXT: addq $-42, %rbx +; O2-NEXT: movq %rdx, %rcx +; O2-NEXT: .Lpcsection471: +; O2-NEXT: adcq $-1, %rcx +; O2-NEXT: .Lpcsection472: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection473: +; O2-NEXT: jne .LBB224_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_sub_release: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection468: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection469: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB224_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq %rax, %rbx +; O3-NEXT: .Lpcsection470: +; O3-NEXT: addq $-42, %rbx +; O3-NEXT: movq %rdx, %rcx +; O3-NEXT: .Lpcsection471: +; O3-NEXT: adcq $-1, %rcx +; O3-NEXT: .Lpcsection472: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection473: +; O3-NEXT: jne .LBB224_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: 
.cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw sub ptr %a, i128 42 release, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_and_release(ptr %a) { +; O0-LABEL: atomic128_and_release: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection580: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection581: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection582: +; O0-NEXT: jmp .LBB225_1 +; O0-NEXT: .LBB225_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movl %eax, %ecx +; O0-NEXT: .Lpcsection583: +; O0-NEXT: andl $42, %ecx +; O0-NEXT: movl %ecx, %ebx +; O0-NEXT: .Lpcsection584: +; O0-NEXT: xorl %ecx, %ecx +; O0-NEXT: .Lpcsection585: +; O0-NEXT: # kill: def $rcx killed $ecx +; O0-NEXT: .Lpcsection586: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection587: +; O0-NEXT: jne .LBB225_1 +; O0-NEXT: jmp .LBB225_2 +; O0-NEXT: .LBB225_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_and_release: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection474: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection475: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB225_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movl %eax, %ebx +; O1-NEXT: .Lpcsection476: +; O1-NEXT: andl $42, %ebx +; O1-NEXT: .Lpcsection477: +; O1-NEXT: xorl %ecx, %ecx +; O1-NEXT: .Lpcsection478: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection479: +; O1-NEXT: jne .LBB225_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_and_release: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection474: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection475: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB225_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movl %eax, %ebx +; O2-NEXT: .Lpcsection476: +; O2-NEXT: andl $42, %ebx +; O2-NEXT: .Lpcsection477: +; O2-NEXT: xorl %ecx, %ecx +; O2-NEXT: .Lpcsection478: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection479: +; O2-NEXT: jne .LBB225_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_and_release: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq 
foo(%rip), %rax +; O3-NEXT: .Lpcsection474: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection475: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB225_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movl %eax, %ebx +; O3-NEXT: .Lpcsection476: +; O3-NEXT: andl $42, %ebx +; O3-NEXT: .Lpcsection477: +; O3-NEXT: xorl %ecx, %ecx +; O3-NEXT: .Lpcsection478: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection479: +; O3-NEXT: jne .LBB225_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw and ptr %a, i128 42 release, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_or_release(ptr %a) { +; O0-LABEL: atomic128_or_release: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection588: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection589: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection590: +; O0-NEXT: jmp .LBB226_1 +; O0-NEXT: .LBB226_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movq %rax, %rbx +; O0-NEXT: .Lpcsection591: +; O0-NEXT: orq $42, %rbx +; O0-NEXT: movq %rcx, %rdx +; O0-NEXT: .Lpcsection592: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection593: +; O0-NEXT: jne .LBB226_1 +; O0-NEXT: jmp .LBB226_2 +; O0-NEXT: .LBB226_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_or_release: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection480: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection481: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB226_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movq %rax, %rbx +; O1-NEXT: .Lpcsection482: +; O1-NEXT: orq $42, %rbx +; O1-NEXT: movq %rdx, %rcx +; O1-NEXT: .Lpcsection483: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection484: +; O1-NEXT: jne .LBB226_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_or_release: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection480: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection481: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB226_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movq %rax, %rbx +; O2-NEXT: .Lpcsection482: +; O2-NEXT: orq $42, %rbx +; O2-NEXT: movq %rdx, %rcx +; O2-NEXT: 
.Lpcsection483: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection484: +; O2-NEXT: jne .LBB226_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_or_release: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection480: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection481: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB226_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq %rax, %rbx +; O3-NEXT: .Lpcsection482: +; O3-NEXT: orq $42, %rbx +; O3-NEXT: movq %rdx, %rcx +; O3-NEXT: .Lpcsection483: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection484: +; O3-NEXT: jne .LBB226_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw or ptr %a, i128 42 release, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_xor_release(ptr %a) { +; O0-LABEL: atomic128_xor_release: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection594: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection595: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection596: +; O0-NEXT: jmp .LBB227_1 +; O0-NEXT: .LBB227_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movq %rax, %rbx +; O0-NEXT: .Lpcsection597: +; O0-NEXT: xorq $42, %rbx +; O0-NEXT: movq %rcx, %rdx +; O0-NEXT: .Lpcsection598: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection599: +; O0-NEXT: jne .LBB227_1 +; O0-NEXT: jmp .LBB227_2 +; O0-NEXT: .LBB227_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_xor_release: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection485: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection486: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB227_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movq %rax, %rbx +; O1-NEXT: .Lpcsection487: +; O1-NEXT: xorq $42, %rbx +; O1-NEXT: movq %rdx, %rcx +; O1-NEXT: .Lpcsection488: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection489: +; O1-NEXT: jne .LBB227_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_xor_release: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, 
-16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection485: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection486: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB227_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movq %rax, %rbx +; O2-NEXT: .Lpcsection487: +; O2-NEXT: xorq $42, %rbx +; O2-NEXT: movq %rdx, %rcx +; O2-NEXT: .Lpcsection488: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection489: +; O2-NEXT: jne .LBB227_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_xor_release: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection485: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection486: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB227_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq %rax, %rbx +; O3-NEXT: .Lpcsection487: +; O3-NEXT: xorq $42, %rbx +; O3-NEXT: movq %rdx, %rcx +; O3-NEXT: .Lpcsection488: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection489: +; O3-NEXT: jne .LBB227_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw xor ptr %a, i128 42 release, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_nand_release(ptr %a) { +; O0-LABEL: atomic128_nand_release: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection600: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection601: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection602: +; O0-NEXT: jmp .LBB228_1 +; O0-NEXT: .LBB228_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movl %eax, %ecx +; O0-NEXT: .Lpcsection603: +; O0-NEXT: notl %ecx +; O0-NEXT: .Lpcsection604: +; O0-NEXT: # implicit-def: $rbx +; O0-NEXT: movl %ecx, %ebx +; O0-NEXT: .Lpcsection605: +; O0-NEXT: orq $-43, %rbx +; O0-NEXT: .Lpcsection606: +; O0-NEXT: movq $-1, %rcx +; O0-NEXT: .Lpcsection607: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection608: +; O0-NEXT: jne .LBB228_1 +; O0-NEXT: jmp .LBB228_2 +; O0-NEXT: .LBB228_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_nand_release: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection490: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection491: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .Lpcsection492: +; O1-NEXT: movq $-1, %rcx +; O1-NEXT: 
.p2align 4, 0x90 +; O1-NEXT: .LBB228_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movl %eax, %ebx +; O1-NEXT: .Lpcsection493: +; O1-NEXT: notl %ebx +; O1-NEXT: .Lpcsection494: +; O1-NEXT: orq $-43, %rbx +; O1-NEXT: .Lpcsection495: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection496: +; O1-NEXT: jne .LBB228_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_nand_release: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection490: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection491: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .Lpcsection492: +; O2-NEXT: movq $-1, %rcx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB228_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movl %eax, %ebx +; O2-NEXT: .Lpcsection493: +; O2-NEXT: notl %ebx +; O2-NEXT: .Lpcsection494: +; O2-NEXT: orq $-43, %rbx +; O2-NEXT: .Lpcsection495: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection496: +; O2-NEXT: jne .LBB228_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_nand_release: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection490: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection491: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .Lpcsection492: +; O3-NEXT: movq $-1, %rcx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB228_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movl %eax, %ebx +; O3-NEXT: .Lpcsection493: +; O3-NEXT: notl %ebx +; O3-NEXT: .Lpcsection494: +; O3-NEXT: orq $-43, %rbx +; O3-NEXT: .Lpcsection495: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection496: +; O3-NEXT: jne .LBB228_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw nand ptr %a, i128 42 release, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_xchg_acq_rel(ptr %a) { +; O0-LABEL: atomic128_xchg_acq_rel: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection609: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection610: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection611: +; O0-NEXT: jmp .LBB229_1 +; O0-NEXT: .LBB229_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: .Lpcsection612: +; O0-NEXT: xorl %ecx, %ecx +; O0-NEXT: .Lpcsection613: +; O0-NEXT: # kill: def $rcx killed $ecx +; O0-NEXT: .Lpcsection614: +; O0-NEXT: movl $42, %ebx +; O0-NEXT: .Lpcsection615: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: 
movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection616: +; O0-NEXT: jne .LBB229_1 +; O0-NEXT: jmp .LBB229_2 +; O0-NEXT: .LBB229_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_xchg_acq_rel: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection497: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection498: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .Lpcsection499: +; O1-NEXT: movl $42, %ebx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB229_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: .Lpcsection500: +; O1-NEXT: xorl %ecx, %ecx +; O1-NEXT: .Lpcsection501: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection502: +; O1-NEXT: jne .LBB229_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_xchg_acq_rel: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection497: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection498: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .Lpcsection499: +; O2-NEXT: movl $42, %ebx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB229_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: .Lpcsection500: +; O2-NEXT: xorl %ecx, %ecx +; O2-NEXT: .Lpcsection501: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection502: +; O2-NEXT: jne .LBB229_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_xchg_acq_rel: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection497: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection498: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .Lpcsection499: +; O3-NEXT: movl $42, %ebx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB229_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: .Lpcsection500: +; O3-NEXT: xorl %ecx, %ecx +; O3-NEXT: .Lpcsection501: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection502: +; O3-NEXT: jne .LBB229_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw xchg ptr %a, i128 42 acq_rel, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_add_acq_rel(ptr %a) { +; O0-LABEL: atomic128_add_acq_rel: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection617: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection618: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection619: +; O0-NEXT: jmp .LBB230_1 +; O0-NEXT: .LBB230_1: # %atomicrmw.start +; O0-NEXT: # =>This 
Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movq %rax, %rbx +; O0-NEXT: .Lpcsection620: +; O0-NEXT: addq $42, %rbx +; O0-NEXT: movq %rdx, %rcx +; O0-NEXT: .Lpcsection621: +; O0-NEXT: adcq $0, %rcx +; O0-NEXT: .Lpcsection622: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection623: +; O0-NEXT: jne .LBB230_1 +; O0-NEXT: jmp .LBB230_2 +; O0-NEXT: .LBB230_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_add_acq_rel: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection503: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection504: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB230_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movq %rax, %rbx +; O1-NEXT: .Lpcsection505: +; O1-NEXT: addq $42, %rbx +; O1-NEXT: movq %rdx, %rcx +; O1-NEXT: .Lpcsection506: +; O1-NEXT: adcq $0, %rcx +; O1-NEXT: .Lpcsection507: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection508: +; O1-NEXT: jne .LBB230_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_add_acq_rel: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection503: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection504: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB230_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movq %rax, %rbx +; O2-NEXT: .Lpcsection505: +; O2-NEXT: addq $42, %rbx +; O2-NEXT: movq %rdx, %rcx +; O2-NEXT: .Lpcsection506: +; O2-NEXT: adcq $0, %rcx +; O2-NEXT: .Lpcsection507: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection508: +; O2-NEXT: jne .LBB230_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_add_acq_rel: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection503: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection504: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB230_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq %rax, %rbx +; O3-NEXT: .Lpcsection505: +; O3-NEXT: addq $42, %rbx +; O3-NEXT: movq %rdx, %rcx +; O3-NEXT: .Lpcsection506: +; O3-NEXT: adcq $0, %rcx +; O3-NEXT: .Lpcsection507: +; O3-NEXT: lock cmpxchg16b (%rdi) +; O3-NEXT: .Lpcsection508: +; O3-NEXT: jne .LBB230_1 +; O3-NEXT: # %bb.2: # %atomicrmw.end +; O3-NEXT: movq $1, foo(%rip) +; O3-NEXT: popq %rbx +; O3-NEXT: .cfi_def_cfa_offset 8 +; O3-NEXT: retq +entry: + load volatile i64, ptr @foo, align 8 + %x = atomicrmw add ptr %a, i128 42 acq_rel, align 16, !pcsections !0 + store volatile i64 1, ptr @foo, align 8 + ret void +} + +define void @atomic128_sub_acq_rel(ptr 
%a) { +; O0-LABEL: atomic128_sub_acq_rel: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbx +; O0-NEXT: .cfi_def_cfa_offset 16 +; O0-NEXT: .cfi_offset %rbx, -16 +; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq foo(%rip), %rax +; O0-NEXT: .Lpcsection624: +; O0-NEXT: movq (%rdi), %rax +; O0-NEXT: .Lpcsection625: +; O0-NEXT: movq 8(%rdi), %rdx +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection626: +; O0-NEXT: jmp .LBB231_1 +; O0-NEXT: .LBB231_1: # %atomicrmw.start +; O0-NEXT: # =>This Inner Loop Header: Depth=1 +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; O0-NEXT: movq %rax, %rbx +; O0-NEXT: .Lpcsection627: +; O0-NEXT: addq $-42, %rbx +; O0-NEXT: movq %rdx, %rcx +; O0-NEXT: .Lpcsection628: +; O0-NEXT: adcq $-1, %rcx +; O0-NEXT: .Lpcsection629: +; O0-NEXT: lock cmpxchg16b (%rsi) +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: .Lpcsection630: +; O0-NEXT: jne .LBB231_1 +; O0-NEXT: jmp .LBB231_2 +; O0-NEXT: .LBB231_2: # %atomicrmw.end +; O0-NEXT: movq $1, foo +; O0-NEXT: popq %rbx +; O0-NEXT: .cfi_def_cfa_offset 8 +; O0-NEXT: retq +; +; O1-LABEL: atomic128_sub_acq_rel: +; O1: # %bb.0: # %entry +; O1-NEXT: pushq %rbx +; O1-NEXT: .cfi_def_cfa_offset 16 +; O1-NEXT: .cfi_offset %rbx, -16 +; O1-NEXT: movq foo(%rip), %rax +; O1-NEXT: .Lpcsection509: +; O1-NEXT: movq (%rdi), %rax +; O1-NEXT: .Lpcsection510: +; O1-NEXT: movq 8(%rdi), %rdx +; O1-NEXT: .p2align 4, 0x90 +; O1-NEXT: .LBB231_1: # %atomicrmw.start +; O1-NEXT: # =>This Inner Loop Header: Depth=1 +; O1-NEXT: movq %rax, %rbx +; O1-NEXT: .Lpcsection511: +; O1-NEXT: addq $-42, %rbx +; O1-NEXT: movq %rdx, %rcx +; O1-NEXT: .Lpcsection512: +; O1-NEXT: adcq $-1, %rcx +; O1-NEXT: .Lpcsection513: +; O1-NEXT: lock cmpxchg16b (%rdi) +; O1-NEXT: .Lpcsection514: +; O1-NEXT: jne .LBB231_1 +; O1-NEXT: # %bb.2: # %atomicrmw.end +; O1-NEXT: movq $1, foo(%rip) +; O1-NEXT: popq %rbx +; O1-NEXT: .cfi_def_cfa_offset 8 +; O1-NEXT: retq +; +; O2-LABEL: atomic128_sub_acq_rel: +; O2: # %bb.0: # %entry +; O2-NEXT: pushq %rbx +; O2-NEXT: .cfi_def_cfa_offset 16 +; O2-NEXT: .cfi_offset %rbx, -16 +; O2-NEXT: movq foo(%rip), %rax +; O2-NEXT: .Lpcsection509: +; O2-NEXT: movq (%rdi), %rax +; O2-NEXT: .Lpcsection510: +; O2-NEXT: movq 8(%rdi), %rdx +; O2-NEXT: .p2align 4, 0x90 +; O2-NEXT: .LBB231_1: # %atomicrmw.start +; O2-NEXT: # =>This Inner Loop Header: Depth=1 +; O2-NEXT: movq %rax, %rbx +; O2-NEXT: .Lpcsection511: +; O2-NEXT: addq $-42, %rbx +; O2-NEXT: movq %rdx, %rcx +; O2-NEXT: .Lpcsection512: +; O2-NEXT: adcq $-1, %rcx +; O2-NEXT: .Lpcsection513: +; O2-NEXT: lock cmpxchg16b (%rdi) +; O2-NEXT: .Lpcsection514: +; O2-NEXT: jne .LBB231_1 +; O2-NEXT: # %bb.2: # %atomicrmw.end +; O2-NEXT: movq $1, foo(%rip) +; O2-NEXT: popq %rbx +; O2-NEXT: .cfi_def_cfa_offset 8 +; O2-NEXT: retq +; +; O3-LABEL: atomic128_sub_acq_rel: +; O3: # %bb.0: # %entry +; O3-NEXT: pushq %rbx +; O3-NEXT: .cfi_def_cfa_offset 16 +; O3-NEXT: .cfi_offset %rbx, -16 +; O3-NEXT: movq foo(%rip), %rax +; O3-NEXT: .Lpcsection509: +; O3-NEXT: movq (%rdi), %rax +; O3-NEXT: .Lpcsection510: +; O3-NEXT: movq 8(%rdi), %rdx +; O3-NEXT: .p2align 4, 0x90 +; O3-NEXT: .LBB231_1: # %atomicrmw.start +; O3-NEXT: # =>This Inner Loop Header: Depth=1 +; O3-NEXT: movq 
%rax, %rbx
+; O3-NEXT: .Lpcsection511:
+; O3-NEXT: addq $-42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection512:
+; O3-NEXT: adcq $-1, %rcx
+; O3-NEXT: .Lpcsection513:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection514:
+; O3-NEXT: jne .LBB231_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw sub ptr %a, i128 42 acq_rel, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_and_acq_rel(ptr %a) {
+; O0-LABEL: atomic128_and_acq_rel:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection631:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection632:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection633:
+; O0-NEXT: jmp .LBB232_1
+; O0-NEXT: .LBB232_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: .Lpcsection634:
+; O0-NEXT: andl $42, %ecx
+; O0-NEXT: movl %ecx, %ebx
+; O0-NEXT: .Lpcsection635:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection636:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection637:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection638:
+; O0-NEXT: jne .LBB232_1
+; O0-NEXT: jmp .LBB232_2
+; O0-NEXT: .LBB232_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_and_acq_rel:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection515:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection516:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB232_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movl %eax, %ebx
+; O1-NEXT: .Lpcsection517:
+; O1-NEXT: andl $42, %ebx
+; O1-NEXT: .Lpcsection518:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection519:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection520:
+; O1-NEXT: jne .LBB232_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_and_acq_rel:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection515:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection516:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB232_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movl %eax, %ebx
+; O2-NEXT: .Lpcsection517:
+; O2-NEXT: andl $42, %ebx
+; O2-NEXT: .Lpcsection518:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection519:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection520:
+; O2-NEXT: jne .LBB232_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_and_acq_rel:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection515:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection516:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB232_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movl %eax, %ebx
+; O3-NEXT: .Lpcsection517:
+; O3-NEXT: andl $42, %ebx
+; O3-NEXT: .Lpcsection518:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection519:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection520:
+; O3-NEXT: jne .LBB232_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw and ptr %a, i128 42 acq_rel, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_or_acq_rel(ptr %a) {
+; O0-LABEL: atomic128_or_acq_rel:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection639:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection640:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection641:
+; O0-NEXT: jmp .LBB233_1
+; O0-NEXT: .LBB233_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection642:
+; O0-NEXT: orq $42, %rbx
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection643:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection644:
+; O0-NEXT: jne .LBB233_1
+; O0-NEXT: jmp .LBB233_2
+; O0-NEXT: .LBB233_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_or_acq_rel:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection521:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection522:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB233_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection523:
+; O1-NEXT: orq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection524:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection525:
+; O1-NEXT: jne .LBB233_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_or_acq_rel:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection521:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection522:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB233_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection523:
+; O2-NEXT: orq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection524:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection525:
+; O2-NEXT: jne .LBB233_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_or_acq_rel:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection521:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection522:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB233_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection523:
+; O3-NEXT: orq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection524:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection525:
+; O3-NEXT: jne .LBB233_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw or ptr %a, i128 42 acq_rel, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_xor_acq_rel(ptr %a) {
+; O0-LABEL: atomic128_xor_acq_rel:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection645:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection646:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection647:
+; O0-NEXT: jmp .LBB234_1
+; O0-NEXT: .LBB234_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection648:
+; O0-NEXT: xorq $42, %rbx
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection649:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection650:
+; O0-NEXT: jne .LBB234_1
+; O0-NEXT: jmp .LBB234_2
+; O0-NEXT: .LBB234_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_xor_acq_rel:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection526:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection527:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB234_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection528:
+; O1-NEXT: xorq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection529:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection530:
+; O1-NEXT: jne .LBB234_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_xor_acq_rel:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection526:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection527:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB234_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection528:
+; O2-NEXT: xorq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection529:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection530:
+; O2-NEXT: jne .LBB234_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_xor_acq_rel:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection526:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection527:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB234_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection528:
+; O3-NEXT: xorq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection529:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection530:
+; O3-NEXT: jne .LBB234_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw xor ptr %a, i128 42 acq_rel, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_nand_acq_rel(ptr %a) {
+; O0-LABEL: atomic128_nand_acq_rel:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection651:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection652:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection653:
+; O0-NEXT: jmp .LBB235_1
+; O0-NEXT: .LBB235_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: .Lpcsection654:
+; O0-NEXT: notl %ecx
+; O0-NEXT: .Lpcsection655:
+; O0-NEXT: # implicit-def: $rbx
+; O0-NEXT: movl %ecx, %ebx
+; O0-NEXT: .Lpcsection656:
+; O0-NEXT: orq $-43, %rbx
+; O0-NEXT: .Lpcsection657:
+; O0-NEXT: movq $-1, %rcx
+; O0-NEXT: .Lpcsection658:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection659:
+; O0-NEXT: jne .LBB235_1
+; O0-NEXT: jmp .LBB235_2
+; O0-NEXT: .LBB235_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_nand_acq_rel:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection531:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection532:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection533:
+; O1-NEXT: movq $-1, %rcx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB235_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movl %eax, %ebx
+; O1-NEXT: .Lpcsection534:
+; O1-NEXT: notl %ebx
+; O1-NEXT: .Lpcsection535:
+; O1-NEXT: orq $-43, %rbx
+; O1-NEXT: .Lpcsection536:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection537:
+; O1-NEXT: jne .LBB235_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_nand_acq_rel:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection531:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection532:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection533:
+; O2-NEXT: movq $-1, %rcx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB235_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movl %eax, %ebx
+; O2-NEXT: .Lpcsection534:
+; O2-NEXT: notl %ebx
+; O2-NEXT: .Lpcsection535:
+; O2-NEXT: orq $-43, %rbx
+; O2-NEXT: .Lpcsection536:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection537:
+; O2-NEXT: jne .LBB235_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_nand_acq_rel:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection531:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection532:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection533:
+; O3-NEXT: movq $-1, %rcx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB235_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movl %eax, %ebx
+; O3-NEXT: .Lpcsection534:
+; O3-NEXT: notl %ebx
+; O3-NEXT: .Lpcsection535:
+; O3-NEXT: orq $-43, %rbx
+; O3-NEXT: .Lpcsection536:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection537:
+; O3-NEXT: jne .LBB235_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw nand ptr %a, i128 42 acq_rel, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_xchg_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_xchg_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection660:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection661:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection662:
+; O0-NEXT: jmp .LBB236_1
+; O0-NEXT: .LBB236_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: .Lpcsection663:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection664:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection665:
+; O0-NEXT: movl $42, %ebx
+; O0-NEXT: .Lpcsection666:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection667:
+; O0-NEXT: jne .LBB236_1
+; O0-NEXT: jmp .LBB236_2
+; O0-NEXT: .LBB236_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_xchg_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection538:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection539:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection540:
+; O1-NEXT: movl $42, %ebx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB236_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: .Lpcsection541:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection542:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection543:
+; O1-NEXT: jne .LBB236_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_xchg_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection538:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection539:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection540:
+; O2-NEXT: movl $42, %ebx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB236_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: .Lpcsection541:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection542:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection543:
+; O2-NEXT: jne .LBB236_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_xchg_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection538:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection539:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection540:
+; O3-NEXT: movl $42, %ebx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB236_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: .Lpcsection541:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection542:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection543:
+; O3-NEXT: jne .LBB236_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw xchg ptr %a, i128 42 seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_add_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_add_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection668:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection669:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection670:
+; O0-NEXT: jmp .LBB237_1
+; O0-NEXT: .LBB237_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection671:
+; O0-NEXT: addq $42, %rbx
+; O0-NEXT: movq %rdx, %rcx
+; O0-NEXT: .Lpcsection672:
+; O0-NEXT: adcq $0, %rcx
+; O0-NEXT: .Lpcsection673:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection674:
+; O0-NEXT: jne .LBB237_1
+; O0-NEXT: jmp .LBB237_2
+; O0-NEXT: .LBB237_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_add_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection544:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection545:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB237_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection546:
+; O1-NEXT: addq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection547:
+; O1-NEXT: adcq $0, %rcx
+; O1-NEXT: .Lpcsection548:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection549:
+; O1-NEXT: jne .LBB237_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_add_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection544:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection545:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB237_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection546:
+; O2-NEXT: addq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection547:
+; O2-NEXT: adcq $0, %rcx
+; O2-NEXT: .Lpcsection548:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection549:
+; O2-NEXT: jne .LBB237_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_add_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection544:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection545:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB237_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection546:
+; O3-NEXT: addq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection547:
+; O3-NEXT: adcq $0, %rcx
+; O3-NEXT: .Lpcsection548:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection549:
+; O3-NEXT: jne .LBB237_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw add ptr %a, i128 42 seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_sub_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_sub_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection675:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection676:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection677:
+; O0-NEXT: jmp .LBB238_1
+; O0-NEXT: .LBB238_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection678:
+; O0-NEXT: addq $-42, %rbx
+; O0-NEXT: movq %rdx, %rcx
+; O0-NEXT: .Lpcsection679:
+; O0-NEXT: adcq $-1, %rcx
+; O0-NEXT: .Lpcsection680:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection681:
+; O0-NEXT: jne .LBB238_1
+; O0-NEXT: jmp .LBB238_2
+; O0-NEXT: .LBB238_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_sub_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection550:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection551:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB238_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection552:
+; O1-NEXT: addq $-42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection553:
+; O1-NEXT: adcq $-1, %rcx
+; O1-NEXT: .Lpcsection554:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection555:
+; O1-NEXT: jne .LBB238_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_sub_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection550:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection551:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB238_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection552:
+; O2-NEXT: addq $-42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection553:
+; O2-NEXT: adcq $-1, %rcx
+; O2-NEXT: .Lpcsection554:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection555:
+; O2-NEXT: jne .LBB238_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_sub_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection550:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection551:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB238_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection552:
+; O3-NEXT: addq $-42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection553:
+; O3-NEXT: adcq $-1, %rcx
+; O3-NEXT: .Lpcsection554:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection555:
+; O3-NEXT: jne .LBB238_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw sub ptr %a, i128 42 seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_and_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_and_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection682:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection683:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection684:
+; O0-NEXT: jmp .LBB239_1
+; O0-NEXT: .LBB239_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: .Lpcsection685:
+; O0-NEXT: andl $42, %ecx
+; O0-NEXT: movl %ecx, %ebx
+; O0-NEXT: .Lpcsection686:
+; O0-NEXT: xorl %ecx, %ecx
+; O0-NEXT: .Lpcsection687:
+; O0-NEXT: # kill: def $rcx killed $ecx
+; O0-NEXT: .Lpcsection688:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection689:
+; O0-NEXT: jne .LBB239_1
+; O0-NEXT: jmp .LBB239_2
+; O0-NEXT: .LBB239_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_and_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection556:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection557:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB239_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movl %eax, %ebx
+; O1-NEXT: .Lpcsection558:
+; O1-NEXT: andl $42, %ebx
+; O1-NEXT: .Lpcsection559:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection560:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection561:
+; O1-NEXT: jne .LBB239_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_and_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection556:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection557:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB239_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movl %eax, %ebx
+; O2-NEXT: .Lpcsection558:
+; O2-NEXT: andl $42, %ebx
+; O2-NEXT: .Lpcsection559:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection560:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection561:
+; O2-NEXT: jne .LBB239_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_and_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection556:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection557:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB239_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movl %eax, %ebx
+; O3-NEXT: .Lpcsection558:
+; O3-NEXT: andl $42, %ebx
+; O3-NEXT: .Lpcsection559:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection560:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection561:
+; O3-NEXT: jne .LBB239_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw and ptr %a, i128 42 seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_or_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_or_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection690:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection691:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection692:
+; O0-NEXT: jmp .LBB240_1
+; O0-NEXT: .LBB240_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection693:
+; O0-NEXT: orq $42, %rbx
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection694:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection695:
+; O0-NEXT: jne .LBB240_1
+; O0-NEXT: jmp .LBB240_2
+; O0-NEXT: .LBB240_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_or_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection562:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection563:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB240_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection564:
+; O1-NEXT: orq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection565:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection566:
+; O1-NEXT: jne .LBB240_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_or_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection562:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection563:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB240_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection564:
+; O2-NEXT: orq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection565:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection566:
+; O2-NEXT: jne .LBB240_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_or_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection562:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection563:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB240_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection564:
+; O3-NEXT: orq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection565:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection566:
+; O3-NEXT: jne .LBB240_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw or ptr %a, i128 42 seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_xor_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_xor_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection696:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection697:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection698:
+; O0-NEXT: jmp .LBB241_1
+; O0-NEXT: .LBB241_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movq %rax, %rbx
+; O0-NEXT: .Lpcsection699:
+; O0-NEXT: xorq $42, %rbx
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection700:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection701:
+; O0-NEXT: jne .LBB241_1
+; O0-NEXT: jmp .LBB241_2
+; O0-NEXT: .LBB241_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_xor_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection567:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection568:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB241_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movq %rax, %rbx
+; O1-NEXT: .Lpcsection569:
+; O1-NEXT: xorq $42, %rbx
+; O1-NEXT: movq %rdx, %rcx
+; O1-NEXT: .Lpcsection570:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection571:
+; O1-NEXT: jne .LBB241_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_xor_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection567:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection568:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB241_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movq %rax, %rbx
+; O2-NEXT: .Lpcsection569:
+; O2-NEXT: xorq $42, %rbx
+; O2-NEXT: movq %rdx, %rcx
+; O2-NEXT: .Lpcsection570:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection571:
+; O2-NEXT: jne .LBB241_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_xor_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection567:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection568:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB241_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movq %rax, %rbx
+; O3-NEXT: .Lpcsection569:
+; O3-NEXT: xorq $42, %rbx
+; O3-NEXT: movq %rdx, %rcx
+; O3-NEXT: .Lpcsection570:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection571:
+; O3-NEXT: jne .LBB241_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw xor ptr %a, i128 42 seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_nand_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_nand_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection702:
+; O0-NEXT: movq (%rdi), %rax
+; O0-NEXT: .Lpcsection703:
+; O0-NEXT: movq 8(%rdi), %rdx
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection704:
+; O0-NEXT: jmp .LBB242_1
+; O0-NEXT: .LBB242_1: # %atomicrmw.start
+; O0-NEXT: # =>This Inner Loop Header: Depth=1
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: .Lpcsection705:
+; O0-NEXT: notl %ecx
+; O0-NEXT: .Lpcsection706:
+; O0-NEXT: # implicit-def: $rbx
+; O0-NEXT: movl %ecx, %ebx
+; O0-NEXT: .Lpcsection707:
+; O0-NEXT: orq $-43, %rbx
+; O0-NEXT: .Lpcsection708:
+; O0-NEXT: movq $-1, %rcx
+; O0-NEXT: .Lpcsection709:
+; O0-NEXT: lock cmpxchg16b (%rsi)
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection710:
+; O0-NEXT: jne .LBB242_1
+; O0-NEXT: jmp .LBB242_2
+; O0-NEXT: .LBB242_2: # %atomicrmw.end
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_nand_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection572:
+; O1-NEXT: movq (%rdi), %rax
+; O1-NEXT: .Lpcsection573:
+; O1-NEXT: movq 8(%rdi), %rdx
+; O1-NEXT: .Lpcsection574:
+; O1-NEXT: movq $-1, %rcx
+; O1-NEXT: .p2align 4, 0x90
+; O1-NEXT: .LBB242_1: # %atomicrmw.start
+; O1-NEXT: # =>This Inner Loop Header: Depth=1
+; O1-NEXT: movl %eax, %ebx
+; O1-NEXT: .Lpcsection575:
+; O1-NEXT: notl %ebx
+; O1-NEXT: .Lpcsection576:
+; O1-NEXT: orq $-43, %rbx
+; O1-NEXT: .Lpcsection577:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection578:
+; O1-NEXT: jne .LBB242_1
+; O1-NEXT: # %bb.2: # %atomicrmw.end
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_nand_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection572:
+; O2-NEXT: movq (%rdi), %rax
+; O2-NEXT: .Lpcsection573:
+; O2-NEXT: movq 8(%rdi), %rdx
+; O2-NEXT: .Lpcsection574:
+; O2-NEXT: movq $-1, %rcx
+; O2-NEXT: .p2align 4, 0x90
+; O2-NEXT: .LBB242_1: # %atomicrmw.start
+; O2-NEXT: # =>This Inner Loop Header: Depth=1
+; O2-NEXT: movl %eax, %ebx
+; O2-NEXT: .Lpcsection575:
+; O2-NEXT: notl %ebx
+; O2-NEXT: .Lpcsection576:
+; O2-NEXT: orq $-43, %rbx
+; O2-NEXT: .Lpcsection577:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection578:
+; O2-NEXT: jne .LBB242_1
+; O2-NEXT: # %bb.2: # %atomicrmw.end
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_nand_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection572:
+; O3-NEXT: movq (%rdi), %rax
+; O3-NEXT: .Lpcsection573:
+; O3-NEXT: movq 8(%rdi), %rdx
+; O3-NEXT: .Lpcsection574:
+; O3-NEXT: movq $-1, %rcx
+; O3-NEXT: .p2align 4, 0x90
+; O3-NEXT: .LBB242_1: # %atomicrmw.start
+; O3-NEXT: # =>This Inner Loop Header: Depth=1
+; O3-NEXT: movl %eax, %ebx
+; O3-NEXT: .Lpcsection575:
+; O3-NEXT: notl %ebx
+; O3-NEXT: .Lpcsection576:
+; O3-NEXT: orq $-43, %rbx
+; O3-NEXT: .Lpcsection577:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection578:
+; O3-NEXT: jne .LBB242_1
+; O3-NEXT: # %bb.2: # %atomicrmw.end
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = atomicrmw nand ptr %a, i128 42 seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_cas_monotonic(ptr %a) {
+; O0-LABEL: atomic128_cas_monotonic:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection711:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection712:
+; O0-NEXT: movl $42, %eax
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection713:
+; O0-NEXT: movl $1, %ebx
+; O0-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection714:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection715:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection716:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection717:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection718:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_cas_monotonic:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection579:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection580:
+; O1-NEXT: movl $1, %ebx
+; O1-NEXT: .Lpcsection581:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection582:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection583:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection584:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection585:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection586:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection587:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection588:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection589:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection590:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection591:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_cas_monotonic:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection579:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection580:
+; O2-NEXT: movl $1, %ebx
+; O2-NEXT: .Lpcsection581:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection582:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection583:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection584:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection585:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection586:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection587:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection588:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection589:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection590:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection591:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_cas_monotonic:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection579:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection580:
+; O3-NEXT: movl $1, %ebx
+; O3-NEXT: .Lpcsection581:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection582:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection583:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection584:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection585:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection586:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection587:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection588:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection589:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection590:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection591:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = cmpxchg ptr %a, i128 42, i128 1 monotonic monotonic, align 16, !pcsections !0
+  %y = cmpxchg ptr %a, i128 42, i128 1 monotonic acquire, align 16, !pcsections !0
+  %z = cmpxchg ptr %a, i128 42, i128 1 monotonic seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_cas_acquire(ptr %a) {
+; O0-LABEL: atomic128_cas_acquire:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection719:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection720:
+; O0-NEXT: movl $42, %eax
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection721:
+; O0-NEXT: movl $1, %ebx
+; O0-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection722:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection723:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection724:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection725:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection726:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_cas_acquire:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection592:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection593:
+; O1-NEXT: movl $1, %ebx
+; O1-NEXT: .Lpcsection594:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection595:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection596:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection597:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection598:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection599:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection600:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection601:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection602:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection603:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection604:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_cas_acquire:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection592:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection593:
+; O2-NEXT: movl $1, %ebx
+; O2-NEXT: .Lpcsection594:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection595:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection596:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection597:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection598:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection599:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection600:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection601:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection602:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection603:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection604:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_cas_acquire:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection592:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection593:
+; O3-NEXT: movl $1, %ebx
+; O3-NEXT: .Lpcsection594:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection595:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection596:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection597:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection598:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection599:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection600:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection601:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection602:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection603:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection604:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = cmpxchg ptr %a, i128 42, i128 1 acquire monotonic, align 16, !pcsections !0
+  %y = cmpxchg ptr %a, i128 42, i128 1 acquire acquire, align 16, !pcsections !0
+  %z = cmpxchg ptr %a, i128 42, i128 1 acquire seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_cas_release(ptr %a) {
+; O0-LABEL: atomic128_cas_release:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection727:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection728:
+; O0-NEXT: movl $42, %eax
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection729:
+; O0-NEXT: movl $1, %ebx
+; O0-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection730:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection731:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection732:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection733:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection734:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_cas_release:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection605:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection606:
+; O1-NEXT: movl $1, %ebx
+; O1-NEXT: .Lpcsection607:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection608:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection609:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection610:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection611:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection612:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection613:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection614:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection615:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection616:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection617:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_cas_release:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection605:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection606:
+; O2-NEXT: movl $1, %ebx
+; O2-NEXT: .Lpcsection607:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection608:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection609:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection610:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection611:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection612:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection613:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection614:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection615:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection616:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection617:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_cas_release:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection605:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection606:
+; O3-NEXT: movl $1, %ebx
+; O3-NEXT: .Lpcsection607:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection608:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection609:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection610:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection611:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection612:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection613:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection614:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection615:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection616:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection617:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = cmpxchg ptr %a, i128 42, i128 1 release monotonic, align 16, !pcsections !0
+  %y = cmpxchg ptr %a, i128 42, i128 1 release acquire, align 16, !pcsections !0
+  %z = cmpxchg ptr %a, i128 42, i128 1 release seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_cas_acq_rel(ptr %a) {
+; O0-LABEL: atomic128_cas_acq_rel:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection735:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection736:
+; O0-NEXT: movl $42, %eax
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection737:
+; O0-NEXT: movl $1, %ebx
+; O0-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection738:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection739:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection740:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection741:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection742:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $1, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_cas_acq_rel:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection618:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection619:
+; O1-NEXT: movl $1, %ebx
+; O1-NEXT: .Lpcsection620:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection621:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection622:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection623:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection624:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection625:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection626:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection627:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection628:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection629:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection630:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $1, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_cas_acq_rel:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection618:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection619:
+; O2-NEXT: movl $1, %ebx
+; O2-NEXT: .Lpcsection620:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection621:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection622:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection623:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection624:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection625:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection626:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection627:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection628:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection629:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection630:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $1, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_cas_acq_rel:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection618:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection619:
+; O3-NEXT: movl $1, %ebx
+; O3-NEXT: .Lpcsection620:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection621:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection622:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection623:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection624:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection625:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection626:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection627:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection628:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection629:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection630:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $1, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = cmpxchg ptr %a, i128 42, i128 1 acq_rel monotonic, align 16, !pcsections !0
+  %y = cmpxchg ptr %a, i128 42, i128 1 acq_rel acquire, align 16, !pcsections !0
+  %z = cmpxchg ptr %a, i128 42, i128 1 acq_rel seq_cst, align 16, !pcsections !0
+  store volatile i64 1, ptr @foo, align 8
+  ret void
+}
+
+define void @atomic128_cas_seq_cst(ptr %a) {
+; O0-LABEL: atomic128_cas_seq_cst:
+; O0: # %bb.0: # %entry
+; O0-NEXT: pushq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 16
+; O0-NEXT: .cfi_offset %rbx, -16
+; O0-NEXT: movq foo(%rip), %rax
+; O0-NEXT: .Lpcsection743:
+; O0-NEXT: xorl %eax, %eax
+; O0-NEXT: movl %eax, %ecx
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection744:
+; O0-NEXT: movl $42, %eax
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: .Lpcsection745:
+; O0-NEXT: movl $1, %ebx
+; O0-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection746:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection747:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection748:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; O0-NEXT: .Lpcsection749:
+; O0-NEXT: # kill: def $rsi killed $rax
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq %rcx, %rdx
+; O0-NEXT: .Lpcsection750:
+; O0-NEXT: lock cmpxchg16b (%rdi)
+; O0-NEXT: movq $3, foo
+; O0-NEXT: popq %rbx
+; O0-NEXT: .cfi_def_cfa_offset 8
+; O0-NEXT: retq
+;
+; O1-LABEL: atomic128_cas_seq_cst:
+; O1: # %bb.0: # %entry
+; O1-NEXT: pushq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 16
+; O1-NEXT: .cfi_offset %rbx, -16
+; O1-NEXT: movq foo(%rip), %rax
+; O1-NEXT: .Lpcsection631:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection632:
+; O1-NEXT: movl $1, %ebx
+; O1-NEXT: .Lpcsection633:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection634:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection635:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection636:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection637:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection638:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection639:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: .Lpcsection640:
+; O1-NEXT: movl $42, %eax
+; O1-NEXT: .Lpcsection641:
+; O1-NEXT: xorl %edx, %edx
+; O1-NEXT: .Lpcsection642:
+; O1-NEXT: xorl %ecx, %ecx
+; O1-NEXT: .Lpcsection643:
+; O1-NEXT: lock cmpxchg16b (%rdi)
+; O1-NEXT: movq $3, foo(%rip)
+; O1-NEXT: popq %rbx
+; O1-NEXT: .cfi_def_cfa_offset 8
+; O1-NEXT: retq
+;
+; O2-LABEL: atomic128_cas_seq_cst:
+; O2: # %bb.0: # %entry
+; O2-NEXT: pushq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 16
+; O2-NEXT: .cfi_offset %rbx, -16
+; O2-NEXT: movq foo(%rip), %rax
+; O2-NEXT: .Lpcsection631:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection632:
+; O2-NEXT: movl $1, %ebx
+; O2-NEXT: .Lpcsection633:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection634:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection635:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection636:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection637:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection638:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection639:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: .Lpcsection640:
+; O2-NEXT: movl $42, %eax
+; O2-NEXT: .Lpcsection641:
+; O2-NEXT: xorl %edx, %edx
+; O2-NEXT: .Lpcsection642:
+; O2-NEXT: xorl %ecx, %ecx
+; O2-NEXT: .Lpcsection643:
+; O2-NEXT: lock cmpxchg16b (%rdi)
+; O2-NEXT: movq $3, foo(%rip)
+; O2-NEXT: popq %rbx
+; O2-NEXT: .cfi_def_cfa_offset 8
+; O2-NEXT: retq
+;
+; O3-LABEL: atomic128_cas_seq_cst:
+; O3: # %bb.0: # %entry
+; O3-NEXT: pushq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 16
+; O3-NEXT: .cfi_offset %rbx, -16
+; O3-NEXT: movq foo(%rip), %rax
+; O3-NEXT: .Lpcsection631:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection632:
+; O3-NEXT: movl $1, %ebx
+; O3-NEXT: .Lpcsection633:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection634:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection635:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection636:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection637:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection638:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection639:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: .Lpcsection640:
+; O3-NEXT: movl $42, %eax
+; O3-NEXT: .Lpcsection641:
+; O3-NEXT: xorl %edx, %edx
+; O3-NEXT: .Lpcsection642:
+; O3-NEXT: xorl %ecx, %ecx
+; O3-NEXT: .Lpcsection643:
+; O3-NEXT: lock cmpxchg16b (%rdi)
+; O3-NEXT: movq $3, foo(%rip)
+; O3-NEXT: popq %rbx
+; O3-NEXT: .cfi_def_cfa_offset 8
+; O3-NEXT: retq
+entry:
+  load volatile i64, ptr @foo, align 8
+  %x = cmpxchg ptr %a, i128 42, i128 1 seq_cst monotonic, align 16, !pcsections !0
+  %y = cmpxchg ptr %a, i128 42, i128 1 seq_cst acquire, align 16, !pcsections !0
+  %z = cmpxchg ptr %a, i128 42, i128 1 seq_cst seq_cst, align 16, !pcsections !0
+  store volatile i64 3, ptr @foo, align 8
+  ret void
+}
+
 !0 = !{!"somesection"}