diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -626,6 +626,20 @@
     setSubclassData(Ordering);
   }
 
+  /// Returns a single ordering which is at least as strong as both the
+  /// success and failure orderings for this cmpxchg.
+  AtomicOrdering getMergedOrdering() const {
+    if (getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
+      return AtomicOrdering::SequentiallyConsistent;
+    if (getFailureOrdering() == AtomicOrdering::Acquire) {
+      if (getSuccessOrdering() == AtomicOrdering::Monotonic)
+        return AtomicOrdering::Acquire;
+      if (getSuccessOrdering() == AtomicOrdering::Release)
+        return AtomicOrdering::AcquireRelease;
+    }
+    return getSuccessOrdering();
+  }
+
   /// Returns the synchronization scope ID of this cmpxchg instruction.
   SyncScope::ID getSyncScopeID() const {
     return SSID;
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -236,12 +236,13 @@
                TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
                    TargetLoweringBase::AtomicExpansionKind::None &&
                (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
-                isAcquireOrStronger(CASI->getSuccessOrdering()))) {
+                isAcquireOrStronger(CASI->getSuccessOrdering()) ||
+                isAcquireOrStronger(CASI->getFailureOrdering()))) {
       // If a compare and swap is lowered to LL/SC, we can do smarter fence
       // insertion, with a stronger one on the success path than on the
       // failure path. As a result, fence insertion is directly done by
       // expandAtomicCmpXchg in that case.
-      FenceOrdering = CASI->getSuccessOrdering();
+      FenceOrdering = CASI->getMergedOrdering();
       CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
       CASI->setFailureOrdering(AtomicOrdering::Monotonic);
     }
@@ -1052,7 +1053,7 @@
                                  "NewVal_Shifted");
   Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
       Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,
-      CI->getSuccessOrdering());
+      CI->getMergedOrdering());
   Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
   Value *Res = UndefValue::get(CI->getType());
   Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
@@ -1167,8 +1168,9 @@
   // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
   // should preserve the ordering.
   bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
-  AtomicOrdering MemOpOrder =
-      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;
+  AtomicOrdering MemOpOrder = ShouldInsertFencesForAtomic
+                                  ? AtomicOrdering::Monotonic
+                                  : CI->getMergedOrdering();
 
   // In implementations which use a barrier to achieve release semantics, we can
   // delay emitting this barrier until we know a store is actually going to be
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -53,9 +53,9 @@
 define void @val_compare_and_swap_monotonic_seqcst(i128* %p, i128 %oldval, i128 %newval) {
 ; CHECK-LLSC-O1-LABEL: val_compare_and_swap_monotonic_seqcst:
-; CHECK-LLSC-O1: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+; CHECK-LLSC-O1: ldaxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
; [... LOTS of stuff that is generic IR unrelated to atomic operations ...]
-; CHECK-LLSC-O1: stxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+; CHECK-LLSC-O1: stlxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 ;
 ; CHECK-CAS-O1-LABEL: val_compare_and_swap_monotonic_seqcst:
 ; CHECK-CAS-O1: caspal x2, x3, x4, x5, [x0]
@@ -76,7 +76,7 @@
 define void @val_compare_and_swap_release_acquire(i128* %p, i128 %oldval, i128 %newval) {
 ; CHECK-LLSC-O1-LABEL: val_compare_and_swap_release_acquire:
-; CHECK-LLSC-O1: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+; CHECK-LLSC-O1: ldaxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
; [... LOTS of stuff that is generic IR unrelated to atomic operations ...]
 ; CHECK-LLSC-O1: stlxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 ;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -214,12 +214,12 @@
 ; CHECK-NOLSE-O1:       ; %bb.0:
 ; CHECK-NOLSE-O1-NEXT:  LBB4_1: ; %cmpxchg.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NOLSE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    ldaxr x8, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cmp x8, x1
 ; CHECK-NOLSE-O1-NEXT:    b.ne LBB4_4
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %cmpxchg.trystore
 ; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB4_1 Depth=1
-; CHECK-NOLSE-O1-NEXT:    stxr w9, x2, [x0]
+; CHECK-NOLSE-O1-NEXT:    stlxr w9, x2, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB4_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.3: ; %cmpxchg.end
 ; CHECK-NOLSE-O1-NEXT:    mov x0, x8
@@ -264,7 +264,7 @@
 ; CHECK-NOLSE-O1:       ; %bb.0:
 ; CHECK-NOLSE-O1-NEXT:  LBB5_1: ; %cmpxchg.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NOLSE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    ldaxr x8, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cmp x8, x1
 ; CHECK-NOLSE-O1-NEXT:    b.ne LBB5_4
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %cmpxchg.trystore
diff --git a/llvm/test/CodeGen/PowerPC/atomics-regression.ll b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
--- a/llvm/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
@@ -488,13 +488,16 @@
 ; PPC64LE-NEXT:  .LBB44_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB44_3
+; PPC64LE-NEXT:    bne 0, .LBB44_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB44_1
-; PPC64LE-NEXT:  .LBB44_3:
+; PPC64LE-NEXT:    bne 0, .LBB44_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB44_4:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release acquire
   ret void
@@ -706,13 +709,16 @@
 ; PPC64LE-NEXT:  .LBB54_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB54_3
+; PPC64LE-NEXT:    bne 0, .LBB54_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB54_1
-; PPC64LE-NEXT:  .LBB54_3:
+; PPC64LE-NEXT:    bne 0, .LBB54_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB54_4:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release acquire
   ret void
@@ -919,13 +925,16 @@
 ; PPC64LE-NEXT:  .LBB64_1:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB64_3
+; PPC64LE-NEXT:    bne 0, .LBB64_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB64_1
-; PPC64LE-NEXT:  .LBB64_3:
+; PPC64LE-NEXT:    bne 0, .LBB64_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB64_4:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release acquire
   ret void
@@ -1127,13 +1136,16 @@
 ; PPC64LE-NEXT:  .LBB74_1:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB74_3
+; PPC64LE-NEXT:    bne 0, .LBB74_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB74_1
-; PPC64LE-NEXT:  .LBB74_3:
+; PPC64LE-NEXT:    bne 0, .LBB74_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB74_4:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release acquire
   ret void
@@ -1340,13 +1352,16 @@
 ; PPC64LE-NEXT:  .LBB84_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB84_3
+; PPC64LE-NEXT:    bne 0, .LBB84_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB84_1
-; PPC64LE-NEXT:  .LBB84_3:
+; PPC64LE-NEXT:    bne 0, .LBB84_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB84_4:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire
   ret void
@@ -1558,13 +1573,16 @@
 ; PPC64LE-NEXT:  .LBB94_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB94_3
+; PPC64LE-NEXT:    bne 0, .LBB94_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB94_1
-; PPC64LE-NEXT:  .LBB94_3:
+; PPC64LE-NEXT:    bne 0, .LBB94_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB94_4:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire
   ret void
@@ -1771,13 +1789,16 @@
 ; PPC64LE-NEXT:  .LBB104_1:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB104_3
+; PPC64LE-NEXT:    bne 0, .LBB104_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB104_1
-; PPC64LE-NEXT:  .LBB104_3:
+; PPC64LE-NEXT:    bne 0, .LBB104_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB104_4:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire
   ret void
@@ -1979,13 +2000,16 @@
 ; PPC64LE-NEXT:  .LBB114_1:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    bne 0, .LBB114_3
+; PPC64LE-NEXT:    bne 0, .LBB114_4
 ; PPC64LE-NEXT:  # %bb.2:
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:    b .LBB114_1
-; PPC64LE-NEXT:  .LBB114_3:
+; PPC64LE-NEXT:    bne 0, .LBB114_1
+; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB114_4:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire
   ret void
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
@@ -337,7 +337,7 @@
 ; RV32IA-NEXT:    andi a2, a2, 255
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a2, (a3)
+; RV32IA-NEXT:    lr.w.aq a2, (a3)
 ; RV32IA-NEXT:    and a5, a2, a4
 ; RV32IA-NEXT:    bne a5, a1, .LBB4_3
 ; RV32IA-NEXT:    # %bb.2: # in Loop: Header=BB4_1 Depth=1
@@ -373,7 +373,7 @@
 ; RV64IA-NEXT:    andi a2, a2, 255
 ; RV64IA-NEXT:    sllw a0, a2, a0
 ; RV64IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT:    lr.w a2, (a3)
+; RV64IA-NEXT:    lr.w.aq a2, (a3)
 ; RV64IA-NEXT:    and a5, a2, a4
 ; RV64IA-NEXT:    bne a5, a1, .LBB4_3
 ; RV64IA-NEXT:    # %bb.2: # in Loop: Header=BB4_1 Depth=1
@@ -1106,7 +1106,7 @@
 ; RV32IA-NEXT:    and a2, a2, a4
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a2, (a3)
+; RV32IA-NEXT:    lr.w.aq a2, (a3)
 ; RV32IA-NEXT:    and a4, a2, a5
 ; RV32IA-NEXT:    bne a4, a1, .LBB14_3
 ; RV32IA-NEXT:    # %bb.2: # in Loop: Header=BB14_1 Depth=1
@@ -1143,7 +1143,7 @@
 ; RV64IA-NEXT:    and a2, a2, a4
 ; RV64IA-NEXT:    sllw a0, a2, a0
 ; RV64IA-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT:    lr.w a2, (a3)
+; RV64IA-NEXT:    lr.w.aq a2, (a3)
 ; RV64IA-NEXT:    and a4, a2, a5
 ; RV64IA-NEXT:    bne a4, a1, .LBB14_3
 ; RV64IA-NEXT:    # %bb.2: # in Loop: Header=BB14_1 Depth=1
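
Editor's note, not part of the patch: the merge rule added as AtomicCmpXchgInst::getMergedOrdering() above is what drives the updated test expectations. For example, a cmpxchg with success ordering release and failure ordering acquire is now fenced and lowered as if it were acq_rel (acquiring loads such as ldaxr/ldaxp in the AArch64 LL/SC loops, a trailing lwsync on both exit paths on PPC64LE, lr.w.aq on RISC-V). The sketch below restates that rule as standalone C++ so it can be compiled and sanity-checked in isolation; the Ordering enum and mergedOrdering() here are local stand-ins for llvm::AtomicOrdering and the new member function, not LLVM API.

// Standalone sketch (assumption: mirrors the case analysis of the
// getMergedOrdering() hunk above, outside the LLVM headers).
#include <cassert>

enum class Ordering {
  Monotonic,
  Acquire,
  Release,
  AcquireRelease,
  SequentiallyConsistent
};

// Returns an ordering at least as strong as both Success and Failure.
static Ordering mergedOrdering(Ordering Success, Ordering Failure) {
  if (Failure == Ordering::SequentiallyConsistent)
    return Ordering::SequentiallyConsistent;
  if (Failure == Ordering::Acquire) {
    if (Success == Ordering::Monotonic)
      return Ordering::Acquire;
    if (Success == Ordering::Release)
      return Ordering::AcquireRelease;
  }
  // In the remaining valid combinations the success ordering already
  // subsumes the failure ordering (cmpxchg failure orderings are never
  // release or acq_rel in LLVM IR).
  return Success;
}

int main() {
  // "release acquire", as in the PPC64LE and RISC-V tests above: merged to
  // acq_rel, hence the acquire-flavoured load and the trailing barrier.
  assert(mergedOrdering(Ordering::Release, Ordering::Acquire) ==
         Ordering::AcquireRelease);
  // "monotonic seq_cst", as in val_compare_and_swap_monotonic_seqcst: the
  // failure ordering dominates.
  assert(mergedOrdering(Ordering::Monotonic,
                        Ordering::SequentiallyConsistent) ==
         Ordering::SequentiallyConsistent);
  // When the success ordering is already the stronger one, it is returned
  // unchanged.
  assert(mergedOrdering(Ordering::SequentiallyConsistent,
                        Ordering::Acquire) ==
         Ordering::SequentiallyConsistent);
  return 0;
}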