diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -78,6 +78,7 @@
   StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
   bool expandAtomicStore(StoreInst *SI);
   bool tryExpandAtomicRMW(AtomicRMWInst *AI);
+  AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
   Value *
   insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
                     Align AddrAlign, AtomicOrdering MemOpOrder,
@@ -281,9 +282,18 @@
       if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
         MadeChange = true;
       } else {
+        AtomicRMWInst::BinOp Op = RMWI->getOperation();
+        if (Op == AtomicRMWInst::Xchg &&
+            RMWI->getValOperand()->getType()->isFloatingPointTy()) {
+          // TODO: add a TLI hook to control this so that each target can
+          // convert to lowering the original type one at a time.
+          RMWI = convertAtomicXchgToIntegerType(RMWI);
+          assert(RMWI->getValOperand()->getType()->isIntegerTy() &&
+                 "invariant broken");
+          MadeChange = true;
+        }
         unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
         unsigned ValueSize = getAtomicOpSize(RMWI);
-        AtomicRMWInst::BinOp Op = RMWI->getOperation();
         if (ValueSize < MinCASSize &&
             (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
              Op == AtomicRMWInst::And)) {
@@ -363,6 +373,32 @@
   return NewLI;
 }
 
+AtomicRMWInst *
+AtomicExpand::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
+  auto *M = RMWI->getModule();
+  Type *NewTy =
+      getCorrespondingIntegerType(RMWI->getType(), M->getDataLayout());
+
+  IRBuilder<> Builder(RMWI);
+
+  Value *Addr = RMWI->getPointerOperand();
+  Value *Val = RMWI->getValOperand();
+  Type *PT = PointerType::get(NewTy, RMWI->getPointerAddressSpace());
+  Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+  Value *NewVal = Builder.CreateBitCast(Val, NewTy);
+
+  auto *NewRMWI =
+      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, NewAddr, NewVal,
+                              RMWI->getAlign(), RMWI->getOrdering());
+  NewRMWI->setVolatile(RMWI->isVolatile());
+  LLVM_DEBUG(dbgs() << "Replaced " << *RMWI << " with " << *NewRMWI << "\n");
+
+  Value *NewRVal = Builder.CreateBitCast(NewRMWI, RMWI->getType());
+  RMWI->replaceAllUsesWith(NewRVal);
+  RMWI->eraseFromParent();
+  return NewRMWI;
+}
+
 bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
   case TargetLoweringBase::AtomicExpansionKind::None:
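In IR terms, the new convertAtomicXchgToIntegerType rewrite turns a floating-point xchg into the same-width integer xchg, bitcasting the pointer and value operands before the operation and the result after it. A minimal hand-written sketch of the before/after IR (illustrative only, with made-up value names; the OUTLINE-ATOMICS checks in the AArch64 expand test below show the same shape emitted by the pass):

  ; before the pass
  %res = atomicrmw xchg float* %ptr, float %val seq_cst

  ; after conversion to the corresponding integer type
  %iptr = bitcast float* %ptr to i32*
  %ival = bitcast float %val to i32
  %iold = atomicrmw xchg i32* %iptr, i32 %ival seq_cst, align 4
  %res = bitcast i32 %iold to float

Targets then only have to lower the integer form, which the tests below exercise for f16, f32, f64, and fp128.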
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll b/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=NOLSE
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=LSE
+
+define half @test_rmw_xchg_f16(half* %dst, half %new) {
+; NOLSE-LABEL: test_rmw_xchg_f16:
+; NOLSE:       // %bb.0:
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
+; NOLSE-NEXT:    fmov w8, s0
+; NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxrh w9, [x0]
+; NOLSE-NEXT:    stlxrh w10, w8, [x0]
+; NOLSE-NEXT:    cbnz w10, .LBB0_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT:    fmov s0, w9
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_xchg_f16:
+; LSE:       // %bb.0:
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
+; LSE-NEXT:    fmov w8, s0
+; LSE-NEXT:    swpalh w8, w8, [x0]
+; LSE-NEXT:    fmov s0, w8
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    ret
+  %res = atomicrmw xchg half* %dst, half %new seq_cst
+  ret half %res
+}
+
+define float @test_rmw_xchg_f32(float* %dst, float %new) {
+; NOLSE-LABEL: test_rmw_xchg_f32:
+; NOLSE:       // %bb.0:
+; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxr w8, [x0]
+; NOLSE-NEXT:    stlxr w10, w9, [x0]
+; NOLSE-NEXT:    cbnz w10, .LBB1_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT:    fmov s0, w8
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_xchg_f32:
+; LSE:       // %bb.0:
+; LSE-NEXT:    fmov w8, s0
+; LSE-NEXT:    swpal w8, w8, [x0]
+; LSE-NEXT:    fmov s0, w8
+; LSE-NEXT:    ret
+  %res = atomicrmw xchg float* %dst, float %new seq_cst
+  ret float %res
+}
+
+define double @test_rmw_xchg_f64(double* %dst, double %new) {
+; NOLSE-LABEL: test_rmw_xchg_f64:
+; NOLSE:       // %bb.0:
+; NOLSE-NEXT:    fmov x8, d0
+; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxr x9, [x0]
+; NOLSE-NEXT:    stlxr w10, x8, [x0]
+; NOLSE-NEXT:    cbnz w10, .LBB2_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT:    fmov d0, x9
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_xchg_f64:
+; LSE:       // %bb.0:
+; LSE-NEXT:    fmov x8, d0
+; LSE-NEXT:    swpal x8, x8, [x0]
+; LSE-NEXT:    fmov d0, x8
+; LSE-NEXT:    ret
+  %res = atomicrmw xchg double* %dst, double %new seq_cst
+  ret double %res
+}
+
+define fp128 @test_rmw_xchg_f128(fp128* %dst, fp128 %new) {
+; NOLSE-LABEL: test_rmw_xchg_f128:
+; NOLSE:       // %bb.0:
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str q0, [sp, #16]
+; NOLSE-NEXT:    ldp x9, x8, [sp, #16]
+; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxp x11, x10, [x0]
+; NOLSE-NEXT:    stlxp w12, x9, x8, [x0]
+; NOLSE-NEXT:    cbnz w12, .LBB3_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT:    stp x11, x10, [sp]
+; NOLSE-NEXT:    ldr q0, [sp], #32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_xchg_f128:
+; LSE:       // %bb.0:
+; LSE-NEXT:    sub sp, sp, #32 // =32
+; LSE-NEXT:    .cfi_def_cfa_offset 32
+; LSE-NEXT:    str q0, [sp, #16]
+; LSE-NEXT:    ldp x9, x8, [sp, #16]
+; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    ldaxp x11, x10, [x0]
+; LSE-NEXT:    stlxp w12, x9, x8, [x0]
+; LSE-NEXT:    cbnz w12, .LBB3_1
+; LSE-NEXT:  // %bb.2: // %atomicrmw.end
+; LSE-NEXT:    stp x11, x10, [sp]
+; LSE-NEXT:    ldr q0, [sp], #32
+; LSE-NEXT:    ret
+  %res = atomicrmw xchg fp128* %dst, fp128 %new seq_cst
+  ret fp128 %res
+}
diff --git a/llvm/test/CodeGen/X86/atomicf128.ll b/llvm/test/CodeGen/X86/atomicf128.ll
--- a/llvm/test/CodeGen/X86/atomicf128.ll
+++ b/llvm/test/CodeGen/X86/atomicf128.ll
@@ -10,21 +10,16 @@
 ; CHECK-LABEL: atomic_fetch_swapf128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT:    movq _fsc128@{{.*}}(%rip), %rsi
-; CHECK-NEXT:    movaps (%rsi), %xmm1
+; CHECK-NEXT:    movq (%rsi), %rax
+; CHECK-NEXT:    movq 8(%rsi), %rdx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_1: ## %atomicrmw.start
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rbx
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
 ; CHECK-NEXT:    lock cmpxchg16b (%rsi)
-; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:    ## %bb.2: ## %atomicrmw.end
 ; CHECK-NEXT:    popq %rbx
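The tighter x86 loop above falls out of the same IR-level rewrite: the fp128 operand is bitcast to i128 once, up front, so the per-iteration movaps/movq round-trips through the stack disappear and only the lock cmpxchg16b remains inside the loop. By analogy with the double tests at the end of this patch, the expanded IR should look roughly like the following (a hand-written sketch under that assumption, with made-up value names, not an autogenerated check):

  %1 = bitcast fp128* %p to i128*
  %2 = bitcast fp128 %new to i128
  %3 = load i128, i128* %1
  br label %atomicrmw.start
  atomicrmw.start:
  %loaded = phi i128 [ %3, %0 ], [ %newloaded, %atomicrmw.start ]
  %pair = cmpxchg i128* %1, i128 %loaded, i128 %2 seq_cst seq_cst
  %success = extractvalue { i128, i1 } %pair, 1
  %newloaded = extractvalue { i128, i1 } %pair, 0
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start
  atomicrmw.end:
  %res = bitcast i128 %newloaded to fp128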
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
--- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
@@ -4,21 +4,25 @@
 
 define void @atomic_swap_f16(half* %ptr, half %val) nounwind {
 ; CHECK-LABEL: @atomic_swap_f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half* [[PTR:%.*]] to i16*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast half [[VAL:%.*]] to i16
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f16(half* [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i16
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[VAL:%.*]] to i16
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0f16(i64 [[TMP5]], half* [[PTR]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i16
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0i16(i64 [[TMP5]], i16* [[TMP1]])
 ; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[TMP4]] to half
 ; CHECK-NEXT:    ret void
 ;
 ; OUTLINE-ATOMICS-LABEL: @atomic_swap_f16(
-; OUTLINE-ATOMICS-NEXT:    [[T1:%.*]] = atomicrmw xchg half* [[PTR:%.*]], half [[VAL:%.*]] acquire
+; OUTLINE-ATOMICS-NEXT:    [[TMP1:%.*]] = bitcast half* [[PTR:%.*]] to i16*
+; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = bitcast half [[VAL:%.*]] to i16
+; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = atomicrmw xchg i16* [[TMP1]], i16 [[TMP2]] acquire, align 2
+; OUTLINE-ATOMICS-NEXT:    [[TMP4:%.*]] = bitcast i16 [[TMP3]] to half
 ; OUTLINE-ATOMICS-NEXT:    ret void
 ;
   %t1 = atomicrmw xchg half* %ptr, half %val acquire
@@ -27,21 +31,25 @@
 
 define void @atomic_swap_f32(float* %ptr, float %val) nounwind {
 ; CHECK-LABEL: @atomic_swap_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[PTR:%.*]] to i32*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[VAL:%.*]] to i32
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f32(float* [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[VAL:%.*]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0f32(i64 [[TMP5]], float* [[PTR]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0i32(i64 [[TMP5]], i32* [[TMP1]])
 ; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP4]] to float
 ; CHECK-NEXT:    ret void
 ;
 ; OUTLINE-ATOMICS-LABEL: @atomic_swap_f32(
-; OUTLINE-ATOMICS-NEXT:    [[T1:%.*]] = atomicrmw xchg float* [[PTR:%.*]], float [[VAL:%.*]] acquire
+; OUTLINE-ATOMICS-NEXT:    [[TMP1:%.*]] = bitcast float* [[PTR:%.*]] to i32*
+; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = bitcast float [[VAL:%.*]] to i32
+; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = atomicrmw xchg i32* [[TMP1]], i32 [[TMP2]] acquire, align 4
+; OUTLINE-ATOMICS-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
 ; OUTLINE-ATOMICS-NEXT:    ret void
 ;
   %t1 = atomicrmw xchg float* %ptr, float %val acquire
@@ -50,19 +58,23 @@
 
 define void @atomic_swap_f64(double* %ptr, double %val) nounwind {
 ; CHECK-LABEL: @atomic_swap_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[PTR:%.*]] to i64*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[VAL:%.*]] to i64
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f64(double* [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[VAL:%.*]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.aarch64.stxr.p0f64(i64 [[TMP3]], double* [[PTR]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.aarch64.stxr.p0i64(i64 [[TMP2]], i64* [[TMP1]])
 ; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP4]], 0
 ; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64 [[TMP3]] to double
 ; CHECK-NEXT:    ret void
 ;
 ; OUTLINE-ATOMICS-LABEL: @atomic_swap_f64(
-; OUTLINE-ATOMICS-NEXT:    [[T1:%.*]] = atomicrmw xchg double* [[PTR:%.*]], double [[VAL:%.*]] acquire
+; OUTLINE-ATOMICS-NEXT:    [[TMP1:%.*]] = bitcast double* [[PTR:%.*]] to i64*
+; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = bitcast double [[VAL:%.*]] to i64
+; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = atomicrmw xchg i64* [[TMP1]], i64 [[TMP2]] acquire, align 8
+; OUTLINE-ATOMICS-NEXT:    [[TMP4:%.*]] = bitcast i64 [[TMP3]] to double
 ; OUTLINE-ATOMICS-NEXT:    ret void
 ;
   %t1 = atomicrmw xchg double* %ptr, double %val acquire
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
@@ -3,19 +3,18 @@
 
 define double @atomic_xchg_f64(double* %ptr) nounwind {
 ; CHECK-LABEL: @atomic_xchg_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[PTR:%.*]] to i64*
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 8
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[PTR]] to i64*
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP3]], i64 4616189618054758400 seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = cmpxchg i64* [[TMP1]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret double [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    ret double [[TMP4]]
 ;
   %result = atomicrmw xchg double* %ptr, double 4.0 seq_cst
   ret double %result
@@ -23,19 +22,18 @@
 
 define double @atomic_xchg_f64_as1(double addrspace(1)* %ptr) nounwind {
 ; CHECK-LABEL: @atomic_xchg_f64_as1(
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double addrspace(1)* [[PTR:%.*]] to i64 addrspace(1)*
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64 addrspace(1)* [[TMP1]], align 8
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)*
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP3]], i64 4616189618054758400 seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = cmpxchg i64 addrspace(1)* [[TMP1]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret double [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    ret double [[TMP4]]
 ;
   %result = atomicrmw xchg double addrspace(1)* %ptr, double 4.0 seq_cst
   ret double %result
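A note on the magic number in these X86 checks: 4616189618054758400 is simply the IEEE-754 bit pattern of the double constant 4.0 that the source exchanges in, exposed as an i64 literal (the cmpxchg new-value operand) once the xchg has been converted to integer form:

  4.0 = (-1)^0 x 1.0 x 2^2
  sign = 0, biased exponent = 2 + 1023 = 1025 = 0x401, significand field = 0
  bits = 0x401 << 52 = 0x4010000000000000 = 4616189618054758400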