Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -1197,13 +1197,16 @@
   let ScalarMemoryVT = i16;
 }
 
+// TODO: These need renamed to simple_store/simple_load and then split
+// into a volatile/atomic/ordered flavors so that respective transforms
+// can pick the right combination.
 def nonvolatile_load : PatFrag<(ops node:$ptr),
                                (load node:$ptr), [{
-  return !cast<LoadSDNode>(N)->isVolatile();
+  return cast<LoadSDNode>(N)->isSimple();
 }]>;
 def nonvolatile_store : PatFrag<(ops node:$val, node:$ptr),
                                 (store node:$val, node:$ptr), [{
-  return !cast<StoreSDNode>(N)->isVolatile();
+  return cast<StoreSDNode>(N)->isSimple();
 }]>;
 
 // nontemporal store fragments.
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4898,8 +4898,8 @@
     return true;
   }
 
-  // Do not change the width of a volatile load.
-  if (LoadN->isVolatile())
+  // Do not change the width of a volatile or atomic loads.
+  if (!LoadN->isSimple())
     return false;
 
   // Do not generate loads of non-round integer types since these can
@@ -4931,8 +4931,8 @@
   if (!MemVT.isRound())
     return false;
 
-  // Don't change the width of a volatile load.
-  if (LDST->isVolatile())
+  // Don't change the width of a volatile or atomic loads.
+  if (!LDST->isSimple())
     return false;
 
   // Verify that we are actually reducing a load width here.
@@ -5519,7 +5519,7 @@
     unsigned MemBitSize = MemVT.getScalarSizeInBits();
     APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize);
     if (DAG.MaskedValueIsZero(N1, ExtBits) &&
-        ((!LegalOperations && !LN0->isVolatile()) ||
+        ((!LegalOperations && LN0->isSimple()) ||
          TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
       SDValue ExtLoad =
           DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(),
@@ -6613,7 +6613,7 @@
                                  Depth + 1);
   case ISD::LOAD: {
     auto L = cast<LoadSDNode>(Op.getNode());
-    if (L->isVolatile() || L->isIndexed())
+    if (!L->isSimple() || L->isIndexed())
       return None;
 
     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
@@ -6702,8 +6702,9 @@
   SDValue Chain;
   SmallVector<StoreSDNode *, 8> Stores;
   for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
+    // TODO: allow unordered atomics when wider type is legal (see D66309)
     if (Store->getMemoryVT() != MVT::i8 ||
-        Store->isVolatile() || Store->isIndexed())
+        !Store->isSimple() || Store->isIndexed())
       return SDValue();
     Stores.push_back(Store);
     Chain = Store->getChain();
@@ -6914,7 +6915,8 @@
       return SDValue();
 
     LoadSDNode *L = P->Load;
-    assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
+    assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
+           !L->isIndexed() &&
            "Must be enforced by calculateByteProvider");
     assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
 
@@ -9244,8 +9246,9 @@
   LoadSDNode *LN0 = cast<LoadSDNode>(N0);
 
   if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
-      !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
-      !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+      !N0.hasOneUse() || !LN0->isSimple() ||
+      !DstVT.isVector() || !DstVT.isPow2VectorType() ||
+      !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
     return SDValue();
 
   SmallVector<SDNode *, 4> SetCCs;
@@ -9446,7 +9449,8 @@
 
   LoadSDNode *LN0 = cast<LoadSDNode>(N0);
   EVT MemVT = LN0->getMemoryVT();
-  if ((LegalOperations || LN0->isVolatile() || VT.isVector()) &&
+  if ((LegalOperations || !LN0->isSimple() ||
+       VT.isVector()) &&
       !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
     return SDValue();
 
@@ -9471,7 +9475,7 @@
   if (!ISD::isNON_EXTLoad(N0.getNode()) ||
       !ISD::isUNINDEXEDLoad(N0.getNode()) ||
       ((LegalOperations || VT.isVector() ||
-        cast<LoadSDNode>(N0)->isVolatile()) &&
+        !cast<LoadSDNode>(N0)->isSimple()) &&
        !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
     return {};
 
@@ -10547,7 +10551,7 @@
   if (ISD::isEXTLoad(N0.getNode()) &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
       EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() &&
+      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
         N0.hasOneUse()) ||
        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -10564,7 +10568,7 @@
   if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
       N0.hasOneUse() &&
       EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -10791,7 +10795,7 @@
     // after truncation.
     if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-      if (!LN0->isVolatile() &&
+      if (LN0->isSimple() &&
           LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
         SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
                                          VT, LN0->getChain(), LN0->getBasePtr(),
@@ -11085,7 +11089,7 @@
       // memory accesses. We don't care if the original type was legal or not
       // as we assume software couldn't rely on the number of accesses of an
       // illegal type.
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
        TLI.isOperationLegal(ISD::LOAD, VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
 
@@ -14025,11 +14029,12 @@
 }
 
 SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
-  if (OptLevel == CodeGenOpt::None || LD->isVolatile())
+  if (OptLevel == CodeGenOpt::None || !LD->isSimple())
     return SDValue();
   SDValue Chain = LD->getOperand(0);
   StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
-  if (!ST || ST->isVolatile())
+  // TODO: relax this restriction for unordered atomics (see D66309)
+  if (!ST || !ST->isSimple())
     return SDValue();
 
   EVT LDType = LD->getValueType(0);
@@ -14128,7 +14133,8 @@
   // If load is not volatile and there are no uses of the loaded value (and
   // the updated indexed value in case of indexed loads), change uses of the
   // chain value into uses of the chain input (i.e. delete the dead load).
-  if (!LD->isVolatile()) {
+  // TODO: Allow this for unordered atomics (see D66309)
+  if (LD->isSimple()) {
     if (N->getValueType(1) == MVT::Other) {
       // Unindexed loads.
       if (!N->hasAnyUseOfValue(0)) {
@@ -14699,7 +14705,7 @@
     return false;
 
   LoadSDNode *LD = cast<LoadSDNode>(N);
-  if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
+  if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
       !LD->getValueType(0).isInteger())
     return false;
 
@@ -14930,7 +14936,7 @@
 /// or code size.
 SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
   StoreSDNode *ST  = cast<StoreSDNode>(N);
-  if (ST->isVolatile())
+  if (!ST->isSimple())
     return SDValue();
 
   SDValue Chain = ST->getChain();
@@ -15386,14 +15392,16 @@
     // Loads must only have one use.
     if (!Ld->hasNUsesOfValue(1, 0))
       return;
-    // The memory operands must not be volatile/indexed.
-    if (Ld->isVolatile() || Ld->isIndexed())
+    // The memory operands must not be volatile/indexed/atomic.
+    // TODO: may be able to relax for unordered atomics (see D66309)
+    if (!Ld->isSimple() || Ld->isIndexed())
       return;
   }
   auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                             int64_t &Offset) -> bool {
-    // The memory operands must not be volatile/indexed.
-    if (Other->isVolatile() || Other->isIndexed())
+    // The memory operands must not be volatile/indexed/atomic.
+    // TODO: may be able to relax for unordered atomics (see D66309)
+    if (!Other->isSimple() ||  Other->isIndexed())
       return false;
     // Don't mix temporal stores with non-temporal stores.
     if (St->isNonTemporal() != Other->isNonTemporal())
@@ -15413,8 +15421,10 @@
         // Loads must only have one use.
         if (!OtherLd->hasNUsesOfValue(1, 0))
           return false;
-        // The memory operands must not be volatile/indexed.
-        if (OtherLd->isVolatile() || OtherLd->isIndexed())
+        // The memory operands must not be volatile/indexed/atomic.
+        // TODO: may be able to relax for unordered atomics (see D66309)
+        if (!OtherLd->isSimple() ||
+            OtherLd->isIndexed())
           return false;
         // Don't mix temporal loads with non-temporal loads.
         if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
@@ -16157,7 +16167,7 @@
   case MVT::ppcf128:
     return SDValue();
   case MVT::f32:
-    if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
+    if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) ||
         TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
       ;
       Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
@@ -16169,7 +16179,7 @@
     return SDValue();
   case MVT::f64:
     if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
-         !ST->isVolatile()) ||
+         ST->isSimple()) ||
         TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
       ;
       Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
@@ -16178,7 +16188,7 @@
                           Ptr, ST->getMemOperand());
     }
 
-    if (!ST->isVolatile() &&
+    if (ST->isSimple() &&
         TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
       // Many FP stores are not made apparent until after legalize, e.g. for
       // argument passing.  Since this is so common, custom legalize the
@@ -16225,7 +16235,8 @@
     // memory accesses. We don't care if the original type was legal or not
     // as we assume software couldn't rely on the number of accesses of an
     // illegal type.
-    if (((!LegalOperations && !ST->isVolatile()) ||
+    // TODO: may be able to relax for unordered atomics (see D66309)
+    if (((!LegalOperations && ST->isSimple()) ||
          TLI.isOperationLegal(ISD::STORE, SVT)) &&
         TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
                                      DAG, *ST->getMemOperand())) {
@@ -16306,9 +16317,10 @@
 
   // If this is a load followed by a store to the same location, then the store
   // is dead/noop.
+  // TODO: can relax for unordered atomics (see D66309)
   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
     if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
-        ST->isUnindexed() && !ST->isVolatile() &&
+        ST->isUnindexed() && ST->isSimple() &&
         // There can't be any side effects between the load and store, such as
         // a call or store.
         Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
@@ -16317,9 +16329,10 @@
     }
   }
 
+  // TODO: can relax for unordered atomics (see D66309)
   if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
-    if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
-        !ST1->isVolatile()) {
+    if (ST->isUnindexed() && ST->isSimple() &&
+        ST1->isUnindexed() && ST1->isSimple()) {
       if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
           ST->getMemoryVT() == ST1->getMemoryVT()) {
         // If this is a store followed by a store with the same value to the
@@ -16448,7 +16461,8 @@
       break;
     case ISD::STORE: {
       StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
-      if (ST->isVolatile() || ST->isIndexed())
+      // TODO: can relax for unordered atomics (see D66309)
+      if (!ST->isSimple() || ST->isIndexed())
         continue;
       const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
       // If we store purely within object bounds just before its lifetime ends,
@@ -16757,7 +16771,7 @@
 SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
                                                   SDValue EltNo,
                                                   LoadSDNode *OriginalLoad) {
-  assert(!OriginalLoad->isVolatile());
+  assert(OriginalLoad->isSimple());
 
   EVT ResultVT = EVE->getValueType(0);
   EVT VecEltVT = InVecVT.getVectorElementType();
@@ -17065,7 +17079,7 @@
       ISD::isNormalLoad(VecOp.getNode()) &&
       !Index->hasPredecessor(VecOp.getNode())) {
     auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
-    if (VecLoad && !VecLoad->isVolatile())
+    if (VecLoad && VecLoad->isSimple())
       return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
   }
 
@@ -17124,7 +17138,7 @@
 
   // Make sure we found a non-volatile load and the extractelement is
   // the only use.
-  if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
+  if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple())
     return SDValue();
 
   // If Idx was -1 above, Elt is going to be -1, so just return undef.
@@ -18270,7 +18284,8 @@
 
   auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
   auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
-  if (!Ld || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx)
+  if (!Ld || Ld->getExtensionType() || !Ld->isSimple() ||
+      !ExtIdx)
     return SDValue();
 
   // Allow targets to opt-out.
@@ -19843,7 +19858,9 @@
     // Token chains must be identical.
     if (LHS.getOperand(0) != RHS.getOperand(0) ||
         // Do not let this transformation reduce the number of volatile loads.
-        LLD->isVolatile() || RLD->isVolatile() ||
+        // Be conservative for atomics for the moment
+        // TODO: This does appear to be legal for unordered atomics (see D66309)
+        !LLD->isSimple() || !RLD->isSimple() ||
         // FIXME: If either is a pre/post inc/dec load,
         // we'd need to split out the address adjustment.
         LLD->isIndexed() || RLD->isIndexed() ||
@@ -20521,6 +20538,7 @@
 
   struct MemUseCharacteristics {
     bool IsVolatile;
+    bool IsAtomic;
     SDValue BasePtr;
     int64_t Offset;
     Optional<int64_t> NumBytes;
@@ -20536,18 +20554,20 @@
                      : (LSN->getAddressingMode() == ISD::PRE_DEC)
                            ? -1 * C->getSExtValue()
                            : 0;
-      return {LSN->isVolatile(), LSN->getBasePtr(), Offset /*base offset*/,
+      return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
+              Offset /*base offset*/,
               Optional<int64_t>(LSN->getMemoryVT().getStoreSize()),
               LSN->getMemOperand()};
     }
     if (const auto *LN = cast<LifetimeSDNode>(N))
-      return {false /*isVolatile*/, LN->getOperand(1),
+      return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1),
               (LN->hasOffset()) ? LN->getOffset() : 0,
               (LN->hasOffset()) ? Optional<int64_t>(LN->getSize())
                                 : Optional<int64_t>(),
               (MachineMemOperand *)nullptr};
     // Default.
-    return {false /*isvolatile*/, SDValue(), (int64_t)0 /*offset*/,
+    return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(),
+            (int64_t)0 /*offset*/,
             Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr};
   };
 
@@ -20563,6 +20583,11 @@
   if (MUC0.IsVolatile && MUC1.IsVolatile)
     return true;
 
+  // Be conservative about atomics for the moment
+  // TODO: this is way overconservative for unordered atomics (see D66309)
+  if (MUC0.IsAtomic && MUC1.IsAtomic)
+    return true;
+
   if (MUC0.MMO && MUC1.MMO) {
     if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
         (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
@@ -20644,7 +20669,8 @@
   SmallPtrSet<SDNode *, 16> Visited;  // Visited node set.
 
   // Get alias information for node.
-  const bool IsLoad = isa<LoadSDNode>(N) && !cast<LoadSDNode>(N)->isVolatile();
+  // TODO: relax aliasing for unordered atomics (see D66309)
+  const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();
 
   // Starting off.
   Chains.push_back(OriginalChain);
@@ -20660,8 +20686,9 @@
     case ISD::LOAD:
     case ISD::STORE: {
       // Get alias information for C.
+      // TODO: relax aliasing for unordered atomics (see D66309)
       bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
-                      !cast<LSBaseSDNode>(C.getNode())->isVolatile();
+                      cast<LSBaseSDNode>(C.getNode())->isSimple();
       if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
         // Look further up the chain.
         C = C.getOperand(0);
@@ -20816,7 +20843,8 @@
     // If the chain has more than one use, then we can't reorder the mem ops.
     if (!SDValue(Chain, 0)->hasOneUse())
       break;
-    if (Chain->isVolatile() || Chain->isIndexed())
+    // TODO: relax for unordered atomics (see D66309)
+    if (!Chain->isSimple() || Chain->isIndexed())
       break;
 
     // Find the base pointer and offset for this memory node.
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4772,7 +4772,7 @@
 
   int LdWidth = LdVT.getSizeInBits();
   int WidthDiff = WidenWidth - LdWidth;
-  unsigned LdAlign = LD->isVolatile() ? 0 : Align; // Allow wider loads.
+  unsigned LdAlign = (!LD->isSimple()) ? 0 : Align; // Allow wider loads.
 
   // Find the vector type that can load from.
   EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8974,7 +8974,7 @@
 
   // Loads don't have side effects, look through them.
   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
-    if (!Ld->isVolatile())
+    if (Ld->isUnordered())
       return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
   }
   return false;
@@ -9212,6 +9212,9 @@
                                                   int Dist) const {
   if (LD->isVolatile() || Base->isVolatile())
     return false;
+  // TODO: probably too restrictive for atomics, revisit
+  if (!LD->isSimple())
+    return false;
   if (LD->isIndexed() || Base->isIndexed())
     return false;
   if (LD->getChain() != Base->getChain())
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3224,7 +3224,7 @@
       LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
       APInt bestMask;
       unsigned bestWidth = 0, bestOffset = 0;
-      if (!Lod->isVolatile() && Lod->isUnindexed()) {
+      if (Lod->isSimple() && Lod->isUnindexed()) {
         unsigned origWidth = N0.getValueSizeInBits();
         unsigned maskWidth = origWidth;
         // We can narrow (e.g.) 16-bit extending loads on 32-bit target to
Index: test/CodeGen/X86/atomic-unordered.ll
===================================================================
--- test/CodeGen/X86/atomic-unordered.ll
+++ test/CodeGen/X86/atomic-unordered.ll
@@ -604,16 +604,11 @@
 
 ; Legal if wider type is also atomic (TODO)
 define void @widen_zero_init(i32* %p0, i32 %v1, i32 %v2) {
-; CHECK-NOX-LABEL: widen_zero_init:
-; CHECK-NOX:       # %bb.0:
-; CHECK-NOX-NEXT:    movl $0, (%rdi)
-; CHECK-NOX-NEXT:    movl $0, 4(%rdi)
-; CHECK-NOX-NEXT:    retq
-;
-; CHECK-EX-LABEL: widen_zero_init:
-; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movq $0, (%rdi)
-; CHECK-EX-NEXT:    retq
+; CHECK-LABEL: widen_zero_init:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $0, (%rdi)
+; CHECK-NEXT:    movl $0, 4(%rdi)
+; CHECK-NEXT:    retq
   %p1 = getelementptr i32, i32* %p0, i64 1
   store atomic i32 0, i32* %p0 unordered, align 8
   store atomic i32 0, i32* %p1 unordered, align 4
@@ -622,16 +617,11 @@
 
 ; Not legal to widen due to alignment restriction
 define void @widen_zero_init_unaligned(i32* %p0, i32 %v1, i32 %v2) {
-; CHECK-NOX-LABEL: widen_zero_init_unaligned:
-; CHECK-NOX:       # %bb.0:
-; CHECK-NOX-NEXT:    movl $0, (%rdi)
-; CHECK-NOX-NEXT:    movl $0, 4(%rdi)
-; CHECK-NOX-NEXT:    retq
-;
-; CHECK-EX-LABEL: widen_zero_init_unaligned:
-; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movq $0, (%rdi)
-; CHECK-EX-NEXT:    retq
+; CHECK-LABEL: widen_zero_init_unaligned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $0, (%rdi)
+; CHECK-NEXT:    movl $0, 4(%rdi)
+; CHECK-NEXT:    retq
   %p1 = getelementptr i32, i32* %p0, i64 1
   store atomic i32 0, i32* %p0 unordered, align 4
   store atomic i32 0, i32* %p1 unordered, align 4
@@ -1449,7 +1439,7 @@
 ;
 ; CHECK-EX-LABEL: load_fold_shl3:
 ; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movb (%rsi), %al
+; CHECK-EX-NEXT:    movq (%rsi), %rax
 ; CHECK-EX-NEXT:    shlxq %rax, (%rdi), %rax
 ; CHECK-EX-NEXT:    retq
   %v = load atomic i64, i64* %p1 unordered, align 8
@@ -1510,7 +1500,7 @@
 ;
 ; CHECK-EX-LABEL: load_fold_lshr3:
 ; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movb (%rsi), %al
+; CHECK-EX-NEXT:    movq (%rsi), %rax
 ; CHECK-EX-NEXT:    shrxq %rax, (%rdi), %rax
 ; CHECK-EX-NEXT:    retq
   %v = load atomic i64, i64* %p1 unordered, align 8
@@ -1571,7 +1561,7 @@
 ;
 ; CHECK-EX-LABEL: load_fold_ashr3:
 ; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movb (%rsi), %al
+; CHECK-EX-NEXT:    movq (%rsi), %rax
 ; CHECK-EX-NEXT:    sarxq %rax, (%rdi), %rax
 ; CHECK-EX-NEXT:    retq
   %v = load atomic i64, i64* %p1 unordered, align 8
@@ -2694,16 +2684,11 @@
 
 ; Legal to reduce the load width (TODO)
 define i32 @fold_trunc(i64* %p) {
-; CHECK-NOX-LABEL: fold_trunc:
-; CHECK-NOX:       # %bb.0:
-; CHECK-NOX-NEXT:    movq (%rdi), %rax
-; CHECK-NOX-NEXT:    # kill: def $eax killed $eax killed $rax
-; CHECK-NOX-NEXT:    retq
-;
-; CHECK-EX-LABEL: fold_trunc:
-; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movl (%rdi), %eax
-; CHECK-EX-NEXT:    retq
+; CHECK-LABEL: fold_trunc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
   %ret = trunc i64 %v to i32
   ret i32 %ret
@@ -2727,8 +2712,9 @@
 ;
 ; CHECK-EX-LABEL: fold_trunc_add:
 ; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movl %esi, %eax
-; CHECK-EX-NEXT:    addl (%rdi), %eax
+; CHECK-EX-NEXT:    movq (%rdi), %rax
+; CHECK-EX-NEXT:    addl %esi, %eax
+; CHECK-EX-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-EX-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
   %trunc = trunc i64 %v to i32
@@ -2754,8 +2740,9 @@
 ;
 ; CHECK-EX-LABEL: fold_trunc_and:
 ; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movl %esi, %eax
-; CHECK-EX-NEXT:    andl (%rdi), %eax
+; CHECK-EX-NEXT:    movq (%rdi), %rax
+; CHECK-EX-NEXT:    andl %esi, %eax
+; CHECK-EX-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-EX-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
   %trunc = trunc i64 %v to i32
@@ -2781,8 +2768,9 @@
 ;
 ; CHECK-EX-LABEL: fold_trunc_or:
 ; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movl %esi, %eax
-; CHECK-EX-NEXT:    orl (%rdi), %eax
+; CHECK-EX-NEXT:    movq (%rdi), %rax
+; CHECK-EX-NEXT:    orl %esi, %eax
+; CHECK-EX-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-EX-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
   %trunc = trunc i64 %v to i32
@@ -2864,17 +2852,11 @@
 
 ; Legal to forward (TODO)
 define i64 @store_forward(i64* %p, i64 %v) {
-; CHECK-NOX-LABEL: store_forward:
-; CHECK-NOX:       # %bb.0:
-; CHECK-NOX-NEXT:    movq %rsi, (%rdi)
-; CHECK-NOX-NEXT:    movq (%rdi), %rax
-; CHECK-NOX-NEXT:    retq
-;
-; CHECK-EX-LABEL: store_forward:
-; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movq %rsi, %rax
-; CHECK-EX-NEXT:    movq %rsi, (%rdi)
-; CHECK-EX-NEXT:    retq
+; CHECK-LABEL: store_forward:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rsi, (%rdi)
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    retq
   store atomic i64 %v, i64* %p unordered, align 8
   %ret = load atomic i64, i64* %p unordered, align 8
   ret i64 %ret
@@ -2894,16 +2876,11 @@
 
 ; Legal to kill (TODO)
 define void @dead_store(i64* %p, i64 %v) {
-; CHECK-NOX-LABEL: dead_store:
-; CHECK-NOX:       # %bb.0:
-; CHECK-NOX-NEXT:    movq $0, (%rdi)
-; CHECK-NOX-NEXT:    movq %rsi, (%rdi)
-; CHECK-NOX-NEXT:    retq
-;
-; CHECK-EX-LABEL: dead_store:
-; CHECK-EX:       # %bb.0:
-; CHECK-EX-NEXT:    movq %rsi, (%rdi)
-; CHECK-EX-NEXT:    retq
+; CHECK-LABEL: dead_store:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq $0, (%rdi)
+; CHECK-NEXT:    movq %rsi, (%rdi)
+; CHECK-NEXT:    retq
   store atomic i64 0, i64* %p unordered, align 8
   store atomic i64 %v, i64* %p unordered, align 8
   ret void