diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -825,6 +825,45 @@
   auto TInfo = getContext().getTypeInfoInChars(AtomicTy);
   uint64_t Size = TInfo.Width.getQuantity();
   unsigned MaxInlineWidthInBits = getTarget().getMaxAtomicInlineWidth();
+  unsigned MaxPromoteWidthInBits = getTarget().getMaxAtomicPromoteWidth();
+
+  switch (E->getOp()) {
+  case AtomicExpr::AO__atomic_load_n:
+  case AtomicExpr::AO__atomic_load:
+  case AtomicExpr::AO__atomic_store_n:
+  case AtomicExpr::AO__atomic_store:
+  case AtomicExpr::AO__atomic_exchange_n:
+  case AtomicExpr::AO__atomic_exchange:
+  case AtomicExpr::AO__atomic_compare_exchange_n:
+  case AtomicExpr::AO__atomic_compare_exchange:
+  case AtomicExpr::AO__atomic_fetch_add:
+  case AtomicExpr::AO__atomic_fetch_sub:
+  case AtomicExpr::AO__atomic_add_fetch:
+  case AtomicExpr::AO__atomic_sub_fetch:
+  case AtomicExpr::AO__atomic_fetch_and:
+  case AtomicExpr::AO__atomic_fetch_or:
+  case AtomicExpr::AO__atomic_fetch_xor:
+  case AtomicExpr::AO__atomic_fetch_nand:
+  case AtomicExpr::AO__atomic_and_fetch:
+  case AtomicExpr::AO__atomic_or_fetch:
+  case AtomicExpr::AO__atomic_xor_fetch:
+  case AtomicExpr::AO__atomic_nand_fetch:
+  case AtomicExpr::AO__atomic_max_fetch:
+  case AtomicExpr::AO__atomic_min_fetch:
+  case AtomicExpr::AO__atomic_fetch_max:
+  case AtomicExpr::AO__atomic_fetch_min:
+    // gcc assumes that the pointer argument is naturally aligned whenever an
+    // __atomic_* builtin can be inlined. This isn't documented anywhere, but
+    // it is apparent from gcc's generated assembly, so match that here.
+    //
+    // (For the other operations, it's hard to tell what gcc expects.)
+    if (TInfo.Width.isPowerOfTwo() && Ptr.getAlignment() < TInfo.Width &&
+        getContext().toBits(TInfo.Width) <= MaxPromoteWidthInBits)
+      Ptr = Ptr.withAlignment(TInfo.Width);
+    break;
+  default:
+    break;
+  }
 
   bool Oversized = getContext().toBits(TInfo.Width) > MaxInlineWidthInBits;
   bool Misaligned = (Ptr.getAlignment() % TInfo.Width) != 0;
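
The practical effect shows up in the test updates below: an 8-byte struct with
only 4-byte alignment (like IntPair in atomics-sema-alignment.c) is now assumed
to be naturally aligned, so the generic __atomic_load lowers to an inline
atomic load instead of a libatomic call. A minimal sketch, assuming a target
whose maximum atomic promote width is at least 64 bits (the struct layout and
the load_pair helper are illustrative, not taken from the tests):

  typedef struct { int a, b; } IntPair;   // 8 bytes wide, only 4-byte aligned

  void load_pair(IntPair *p, IntPair *out) {
    // Previously: a libcall along the lines of
    //   call void @__atomic_load(i32 8, ...)
    // Now: the pointer is promoted to the natural 8-byte alignment, giving
    // roughly
    //   load atomic i64, i64* %p seq_cst, align 8
    __atomic_load(p, out, __ATOMIC_SEQ_CST);
  }
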
diff --git a/clang/test/CodeGen/atomic-ops.c b/clang/test/CodeGen/atomic-ops.c
--- a/clang/test/CodeGen/atomic-ops.c
+++ b/clang/test/CodeGen/atomic-ops.c
@@ -200,9 +200,8 @@
   // CHECK: [[RETVAL:%.*]] = alloca %struct.S, align 4
   // CHECK: [[A:%.*]]   = bitcast %struct.S* {{.*}} to i64*
   // CHECK: [[CAST:%.*]]  = bitcast %struct.S* [[RETVAL]] to i64*
-  // CHECK: [[SRC:%.*]]  = bitcast i64* [[A]] to i8*
-  // CHECK: [[DEST:%.*]]  = bitcast i64* [[CAST]] to i8*
-  // CHECK: call void @__atomic_load(i32 noundef 8, i8* noundef [[SRC]], i8* noundef [[DEST]], i32 noundef 5)
+  // CHECK: load atomic i64, i64* [[A]]
+  // CHECK: store i64 {{.*}}, i64* [[CAST]]
   // CHECK: ret
   struct S ret;
   __atomic_load(a, &ret, memory_order_seq_cst);
@@ -219,9 +218,8 @@
   // CHECK-NEXT: [[LOAD_B_PTR:%.*]] = load %struct.S*, %struct.S** [[B_ADDR]], align 4
   // CHECK-NEXT: [[COERCED_A_TMP:%.*]] = bitcast %struct.S* [[LOAD_A_PTR]] to i64*
   // CHECK-NEXT: [[COERCED_B:%.*]] = bitcast %struct.S* [[LOAD_B_PTR]] to i64*
-  // CHECK-NEXT: [[COERCED_A:%.*]] = bitcast i64* [[COERCED_A_TMP]] to i8*
-  // CHECK-NEXT: [[CAST_B:%.*]] = bitcast i64* [[COERCED_B]] to i8*
-  // CHECK-NEXT: call void @__atomic_store(i32 noundef 8, i8* noundef [[COERCED_A]], i8* noundef [[CAST_B]],
+  // CHECK-NEXT: load i64, i64* [[COERCED_B]]
+  // CHECK-NEXT: store atomic i64 {{.*}}, i64* [[COERCED_A_TMP]]
   // CHECK-NEXT: ret void
   __atomic_store(a, b, memory_order_seq_cst);
 }
@@ -240,11 +238,8 @@
   // CHECK-NEXT: [[COERCED_A_TMP:%.*]] = bitcast %struct.S* [[LOAD_A_PTR]] to i64*
   // CHECK-NEXT: [[COERCED_B:%.*]] = bitcast %struct.S* [[LOAD_B_PTR]] to i64*
   // CHECK-NEXT: [[COERCED_C:%.*]] = bitcast %struct.S* [[LOAD_C_PTR]] to i64*
-  // CHECK-NEXT: [[COERCED_A:%.*]] = bitcast i64* [[COERCED_A_TMP]] to i8*
-  // CHECK-NEXT: [[CAST_B:%.*]] = bitcast i64* [[COERCED_B]] to i8*
-  // CHECK-NEXT: [[CAST_C:%.*]] = bitcast i64* [[COERCED_C]] to i8*
-  // CHECK-NEXT: call void @__atomic_exchange(i32 noundef 8, i8* noundef [[COERCED_A]], i8* noundef [[CAST_B]], i8* noundef [[CAST_C]],
-
+  // CHECK-NEXT: load i64, i64* [[COERCED_B]]
+  // CHECK-NEXT: atomicrmw xchg i64* [[COERCED_A_TMP]]
   __atomic_exchange(a, b, c, memory_order_seq_cst);
 }
 
@@ -262,11 +257,9 @@
   // CHECK-NEXT: [[COERCED_A_TMP:%.*]] = bitcast %struct.S* [[LOAD_A_PTR]] to i64*
   // CHECK-NEXT: [[COERCED_B_TMP:%.*]] = bitcast %struct.S* [[LOAD_B_PTR]] to i64*
   // CHECK-NEXT: [[COERCED_C:%.*]] = bitcast %struct.S* [[LOAD_C_PTR]] to i64*
-  // CHECK-NEXT: [[COERCED_A:%.*]] = bitcast i64* [[COERCED_A_TMP]] to i8*
-  // CHECK-NEXT: [[COERCED_B:%.*]] = bitcast i64* [[COERCED_B_TMP]] to i8*
-  // CHECK-NEXT: [[CAST_C:%.*]] = bitcast i64* [[COERCED_C]] to i8*
-  // CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 8, i8* noundef [[COERCED_A]], i8* noundef [[COERCED_B]], i8* noundef [[CAST_C]],
-  // CHECK-NEXT: ret i1 [[CALL]]
+  // CHECK-NEXT: load i64
+  // CHECK-NEXT: load i64
+  // CHECK-NEXT: cmpxchg weak i64
   return __atomic_compare_exchange(a, b, c, 1, 5, 5);
 }
 
@@ -702,13 +695,13 @@
   // CHECK-LABEL: @test_underaligned
   struct Underaligned { char c[8]; } underaligned_a, underaligned_b, underaligned_c;
 
-  // CHECK: call void @__atomic_load(i32 noundef 8,
+  // CHECK: load atomic i64, {{.*}}, align 8
   __atomic_load(&underaligned_a, &underaligned_b, memory_order_seq_cst);
-  // CHECK: call void @__atomic_store(i32 noundef 8,
+  // CHECK: store atomic i64 {{.*}}, align 8
   __atomic_store(&underaligned_a, &underaligned_b, memory_order_seq_cst);
-  // CHECK: call void @__atomic_exchange(i32 noundef 8,
+  // CHECK: atomicrmw xchg i64* {{.*}}, align 8
   __atomic_exchange(&underaligned_a, &underaligned_b, &underaligned_c, memory_order_seq_cst);
-  // CHECK: call {{.*}} @__atomic_compare_exchange(i32 noundef 8,
+  // CHECK: cmpxchg weak i64* {{.*}}, align 8
   __atomic_compare_exchange(&underaligned_a, &underaligned_b, &underaligned_c, 1, memory_order_seq_cst, memory_order_seq_cst);
 
   __attribute__((aligned)) struct Underaligned aligned_a, aligned_b, aligned_c;
@@ -792,7 +785,7 @@
   // CHECK: store i8 [[NEW]], i8*
   *sc = __atomic_min_fetch(sc, 42, memory_order_release);
 
-  // CHECK: [[OLD:%.*]] = call i64 @__atomic_fetch_umin_8(i8* noundef {{%.*}}, i64 noundef [[RHS:%.*]],
+  // CHECK: [[OLD:%.*]] = atomicrmw umin i64* [[PTR:%.*]], i64 [[RHS:%.*]] release, align 8
   // CHECK: [[TST:%.*]] = icmp ult i64 [[OLD]], [[RHS]]
   // CHECK: [[NEW:%.*]] = select i1 [[TST]], i64 [[OLD]], i64 [[RHS]]
   // CHECK: store i64 [[NEW]], i64*
diff --git a/clang/test/CodeGen/atomics-sema-alignment.c b/clang/test/CodeGen/atomics-sema-alignment.c
--- a/clang/test/CodeGen/atomics-sema-alignment.c
+++ b/clang/test/CodeGen/atomics-sema-alignment.c
@@ -12,10 +12,10 @@
 
 void func(IntPair *p) {
   IntPair res;
-  __atomic_load(p, &res, 0);                    // expected-warning {{misaligned atomic operation may incur significant performance penalty; the expected alignment (8 bytes) exceeds the actual alignment (4 bytes)}}
-  __atomic_store(p, &res, 0);                   // expected-warning {{misaligned atomic operation may incur significant performance penalty; the expected alignment (8 bytes) exceeds the actual alignment (4 bytes)}}
-  __atomic_fetch_add((unaligned_int *)p, 1, 2); // expected-warning {{misaligned atomic operation may incur significant performance penalty; the expected alignment (4 bytes) exceeds the actual alignment (1 bytes)}}
-  __atomic_fetch_sub((unaligned_int *)p, 1, 3); // expected-warning {{misaligned atomic operation may incur significant performance penalty; the expected alignment (4 bytes) exceeds the actual alignment (1 bytes)}}
+  __atomic_load(p, &res, 0);                    // no warning: assumed naturally aligned
+  __atomic_store(p, &res, 0);                   // no warning
+  __atomic_fetch_add((unaligned_int *)p, 1, 2); // no warning
+  __atomic_fetch_sub((unaligned_int *)p, 1, 3); // no warning
 }
 
 void func1(LongStruct *p) {
@@ -40,7 +40,7 @@
 
 void braz(Foo *foo, ThirtyTwo *braz) {
   Foo bar;
-  __atomic_load(foo, &bar, __ATOMIC_RELAXED); // expected-warning {{misaligned atomic operation may incur significant performance penalty; the expected alignment (16 bytes) exceeds the actual alignment (8 bytes)}}
+  __atomic_load(foo, &bar, __ATOMIC_RELAXED); // no warning: assumed naturally aligned
 
   ThirtyTwo thirtyTwo1;
   ThirtyTwo thirtyTwo2;