Index: lib/CodeGen/CGAtomic.cpp
===================================================================
--- lib/CodeGen/CGAtomic.cpp
+++ lib/CodeGen/CGAtomic.cpp
@@ -465,12 +465,20 @@
 static void AddDirectArgument(CodeGenFunction &CGF, CallArgList &Args,
                               bool UseOptimizedLibcall, llvm::Value *Val,
                               QualType ValTy,
-                              SourceLocation Loc) {
+                              SourceLocation Loc, uint64_t Size) {
   if (UseOptimizedLibcall) {
     // Load value and pass it to the function directly.
     unsigned Align = CGF.getContext().getTypeAlignInChars(ValTy).getQuantity();
-    Val = CGF.EmitLoadOfScalar(Val, false, Align, ValTy, Loc);
-    Args.add(RValue::get(Val), ValTy);
+    // Coerce the value into an appropriately sized integer type: the
+    // optimized (_N) libcalls take and return unsigned integers of the
+    // access size.  Cast the pointer and reload rather than bitcasting the
+    // loaded value, since a value bitcast would be invalid IR for pointer
+    // arguments (pointer<->integer requires ptrtoint, not bitcast).
+    ValTy = CGF.getContext().getIntTypeForBitwidth(Size * 8, /*Signed=*/false);
+    llvm::Type *IPtrTy = llvm::IntegerType::get(CGF.getLLVMContext(),
                                                 Size * 8)->getPointerTo();
+    Val = CGF.EmitLoadOfScalar(CGF.Builder.CreateBitCast(Val, IPtrTy), false,
+                               Align, ValTy, Loc);
+    Args.add(RValue::get(Val), ValTy);
   } else {
     // Non-optimized functions always take a reference.
     Args.add(RValue::get(CGF.EmitCastToVoidPtr(Val)),
@@ -638,7 +641,7 @@
     HaveRetTy = true;
     Args.add(RValue::get(EmitCastToVoidPtr(Val1)), getContext().VoidPtrTy);
     AddDirectArgument(*this, Args, UseOptimizedLibcall, Val2, MemTy,
-                      E->getExprLoc());
+                      E->getExprLoc(), Size);
     Args.add(RValue::get(Order), getContext().IntTy);
     Order = OrderFail;
     break;
@@ -650,7 +653,7 @@
   case AtomicExpr::AO__atomic_exchange:
     LibCallName = "__atomic_exchange";
     AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy,
-                      E->getExprLoc());
+                      E->getExprLoc(), Size);
     break;
   // void __atomic_store(size_t size, void *mem, void *val, int order)
   // void __atomic_store_N(T *mem, T val, int order)
@@ -661,7 +664,7 @@
     RetTy = getContext().VoidTy;
     HaveRetTy = true;
     AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy,
-                      E->getExprLoc());
+                      E->getExprLoc(), Size);
     break;
   // void __atomic_load(size_t size, void *mem, void *return, int order)
   // T __atomic_load_N(T *mem, int order)
@@ -675,35 +678,35 @@
   case AtomicExpr::AO__atomic_fetch_add:
LibCallName = "__atomic_fetch_add"; AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, LoweredMemTy, - E->getExprLoc()); + E->getExprLoc(), Size); break; // T __atomic_fetch_and_N(T *mem, T val, int order) case AtomicExpr::AO__c11_atomic_fetch_and: case AtomicExpr::AO__atomic_fetch_and: LibCallName = "__atomic_fetch_and"; AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy, - E->getExprLoc()); + E->getExprLoc(), Size); break; // T __atomic_fetch_or_N(T *mem, T val, int order) case AtomicExpr::AO__c11_atomic_fetch_or: case AtomicExpr::AO__atomic_fetch_or: LibCallName = "__atomic_fetch_or"; AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy, - E->getExprLoc()); + E->getExprLoc(), Size); break; // T __atomic_fetch_sub_N(T *mem, T val, int order) case AtomicExpr::AO__c11_atomic_fetch_sub: case AtomicExpr::AO__atomic_fetch_sub: LibCallName = "__atomic_fetch_sub"; AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, LoweredMemTy, - E->getExprLoc()); + E->getExprLoc(), Size); break; // T __atomic_fetch_xor_N(T *mem, T val, int order) case AtomicExpr::AO__c11_atomic_fetch_xor: case AtomicExpr::AO__atomic_fetch_xor: LibCallName = "__atomic_fetch_xor"; AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy, - E->getExprLoc()); + E->getExprLoc(), Size); break; default: return EmitUnsupportedRValue(E, "atomic library call"); } @@ -715,7 +718,8 @@ if (!HaveRetTy) { if (UseOptimizedLibcall) { // Value is returned directly. - RetTy = MemTy; + // The function returns an appropriately sized integer type. + RetTy = getContext().getIntTypeForBitwidth(Size * 8, /*Signed=*/true); } else { // Value is returned through parameter before the order. 
RetTy = getContext().VoidTy; @@ -733,8 +737,16 @@ llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo); llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, LibCallName); RValue Res = EmitCall(FuncInfo, Func, ReturnValueSlot(), Args); - if (!RetTy->isVoidType()) + if (!RetTy->isVoidType()) { + if (UseOptimizedLibcall && !HaveRetTy) { + llvm::StoreInst *StoreDest = Builder.CreateStore( + Builder.CreateBitCast(Res.getScalarVal(), + Dest->getType()->getPointerElementType()), + Dest); + StoreDest->setAlignment(Align); + } return Res; + } if (E->getType()->isVoidType()) return RValue::get(nullptr); return convertTempToRValue(Dest, E->getType(), E->getExprLoc()); Index: test/CodeGen/atomic-ops.c =================================================================== --- test/CodeGen/atomic-ops.c +++ test/CodeGen/atomic-ops.c @@ -139,6 +139,19 @@ return __c11_atomic_exchange(d, 2, memory_order_seq_cst); } +double fd1(double *d) { + // CHECK-LABEL: @fd1 + // CHECK: [[RET:%.*]] = alloca double, align 8 + // CHECK: [[CALL:%.*]] = call i64 @__atomic_load_8 + // CHECK: [[BITCAST:%.*]] = bitcast i64 %call to double + // CHECK: store double [[BITCAST]], double* [[RET]], align 4 + // CHECK: [[LOAD:%.*]] = load double* [[RET]], align 8 + // CHECK: ret double [[LOAD]] + double ret; + __atomic_load(d, &ret, memory_order_seq_cst); + return ret; +} + int* fp1(_Atomic(int*) *p) { // CHECK-LABEL: @fp1 // CHECK: load atomic i32* {{.*}} seq_cst