diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -309,6 +309,10 @@
   "; the access size (%0 bytes) exceeds the max lock-free size (%1 bytes)">,
   InGroup<AtomicAlignment>;
 
+def warn_sync_op_misaligned : Warning<
+  "__sync builtin operation MUST have natural alignment (consider using __atomic).">,
+  InGroup<SyncAlignment>;
+
 def warn_alias_with_section : Warning<
   "%select{alias|ifunc}1 will not be in section '%0' but in the same section "
   "as the %select{aliasee|resolver}2">,
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -801,6 +801,7 @@
 def CustomAtomic : DiagGroup<"custom-atomic-properties">;
 def AtomicProperties : DiagGroup<"atomic-properties",
                                  [ImplicitAtomic, CustomAtomic]>;
+def SyncAlignment : DiagGroup<"sync-alignment">;
 def ARCUnsafeRetainedAssign : DiagGroup<"arc-unsafe-retained-assign">;
 def ARCRetainCycles : DiagGroup<"arc-retain-cycles">;
 def ARCNonPodMemAccess : DiagGroup<"arc-non-pod-memaccess">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -28,6 +28,7 @@
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
+#include "clang/Frontend/FrontendDiagnostic.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -223,9 +224,23 @@
   return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
 }
 
+static void CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E) {
+  ASTContext &Ctx = CGF.getContext();
+  Address Ptr = CGF.EmitPointerWithAlignment(E->getArg(0));
+  unsigned Bytes = Ptr.getElementType()->isPointerTy()
+                       ? Ctx.getTypeSizeInChars(Ctx.VoidPtrTy).getQuantity()
+                       : Ptr.getElementType()->getScalarSizeInBits() / 8;
+  unsigned Align = Ptr.getAlignment().getQuantity();
+  if (Align % Bytes != 0) {
+    DiagnosticsEngine &Diags = CGF.CGM.getDiags();
+    Diags.Report(E->getBeginLoc(), diag::warn_sync_op_misaligned);
+  }
+}
+
 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
                                llvm::AtomicRMWInst::BinOp Kind,
                                const CallExpr *E) {
+  CheckAtomicAlignment(CGF, E);
   return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
 }
 
@@ -237,6 +252,7 @@
                                const CallExpr *E,
                                Instruction::BinaryOps Op,
                                bool Invert = false) {
+  CheckAtomicAlignment(CGF, E);
   QualType T = E->getType();
   assert(E->getArg(0)->getType()->isPointerType());
   assert(CGF.getContext().hasSameUnqualifiedType(T,
@@ -284,6 +300,7 @@
 /// invoke the function EmitAtomicCmpXchgForMSIntrin.
 static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
                                      bool ReturnBool) {
+  CheckAtomicAlignment(CGF, E);
   QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
@@ -4016,6 +4033,7 @@
   case Builtin::BI__sync_lock_release_4:
   case Builtin::BI__sync_lock_release_8:
   case Builtin::BI__sync_lock_release_16: {
+    CheckAtomicAlignment(*this, E);
     Value *Ptr = EmitScalarExpr(E->getArg(0));
    QualType ElTy = E->getArg(0)->getType()->getPointeeType();
     CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
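The CheckAtomicAlignment() helper warns only when the pointer's known alignment is not a multiple of the access size, so over-aligned objects pass silently and just genuine under-alignment is diagnosed. A minimal standalone sketch of that predicate (the `misaligned` helper and the sample values are illustrative, not part of the patch):

```c
#include <assert.h>

// Mirrors the patch's test "Align % Bytes != 0": under-alignment is flagged,
// while any alignment that is a multiple of the access size is accepted.
static int misaligned(unsigned Align, unsigned Bytes) {
  return Align % Bytes != 0;
}

int main(void) {
  assert(!misaligned(16, 16)); // __int128 at its natural 16-byte alignment: ok
  assert(misaligned(8, 16));   // __int128 at 8 bytes: warns (see the 8Al test)
  assert(!misaligned(8, 4));   // int over-aligned to 8 bytes: ok
  return 0;
}
```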
diff --git a/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c b/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c
@@ -0,0 +1,206 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple s390x-linux-gnu -O1 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s
+//
+// Test __sync_ builtins for __int128 aligned to 16 bytes.
+
+#include <stdint.h>
+
+__int128 Ptr __attribute__((aligned(16)));
+__int128 Val __attribute__((aligned(16)));
+__int128 OldVal __attribute__((aligned(16)));
+
+// CHECK-LABEL: @f1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f1() {
+  return __sync_fetch_and_add(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f2() {
+  return __sync_fetch_and_sub(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f3() {
+  return __sync_fetch_and_or(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f4() {
+  return __sync_fetch_and_and(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f5(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f5() {
+  return __sync_fetch_and_xor(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f6(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f6() {
+  return __sync_fetch_and_nand(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f7(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f7() {
+  return __sync_add_and_fetch(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f8() {
+  return __sync_sub_and_fetch(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f9(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f9() {
+  return __sync_or_and_fetch(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f10(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f10() {
+  return __sync_and_and_fetch(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f11(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f11() {
+  return __sync_xor_and_fetch(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f12(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP2]], -1
+// CHECK-NEXT:    store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f12() {
+  return __sync_nand_and_fetch(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f13(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1
+// CHECK-NEXT:    ret i1 [[TMP3]]
+//
+_Bool f13() {
+  return __sync_bool_compare_and_swap(&Ptr, OldVal, Val);
+}
+
+// CHECK-LABEL: @f14(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0
+// CHECK-NEXT:    store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f14() {
+  return __sync_val_compare_and_swap(&Ptr, OldVal, Val);
+}
+
+// CHECK-LABEL: @f15(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f15() {
+  return __sync_lock_test_and_set(&Ptr, Val);
+}
+
+// CHECK-LABEL: @f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    store atomic i128 0, ptr @Ptr release, align 16
+// CHECK-NEXT:    ret void
+//
+void f16() {
+  return __sync_lock_release(&Ptr);
+}
+
+// CHECK-LABEL: @f17(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+__int128 f17() {
+  return __sync_swap(&Ptr, Val);
+}
diff --git a/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c b/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu -O1 -emit-llvm %s -o - 2>&1 | FileCheck %s
+//
+// Test that an underaligned 16 byte __sync gives a warning.
+
+#include <stdint.h>
+
+__int128 Ptr __attribute__((aligned(8)));
+
+__int128 f1() {
+// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+  return __sync_fetch_and_add(&Ptr, 1);
+}
+
+__int128 f2() {
+// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+  return __sync_sub_and_fetch(&Ptr, 1);
+}
+
+__int128 f3() {
+// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+  return __sync_val_compare_and_swap(&Ptr, 0, 1);
+}
+
+void f4() {
+// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+  __sync_lock_release(&Ptr);
+}
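As the diagnostic text suggests, the `__atomic` builtins are the recommended replacement when natural alignment cannot be guaranteed: they take an explicit memory order and are allowed to lower to a libatomic runtime call for an underaligned object, instead of a native instruction that assumes alignment. A minimal sketch of the migration (the `Counter` variable and function names are illustrative, not from the patch or its tests):

```c
// Underaligned __int128: its natural alignment would be 16 bytes.
__int128 Counter __attribute__((aligned(8)));

__int128 bump_sync(void) {
  // With this patch: warning: __sync builtin operation MUST have natural
  // alignment (consider using __atomic). [-Wsync-alignment]
  return __sync_fetch_and_add(&Counter, 1);
}

__int128 bump_atomic(void) {
  // No warning: the __atomic form takes an explicit memory order and may
  // fall back to a libatomic call when the object is not naturally aligned.
  return __atomic_fetch_add(&Counter, 1, __ATOMIC_SEQ_CST);
}
```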