diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -9701,7 +9701,7 @@

 ::

-      atomicrmw [volatile] <operation> <ty>* <pointer>, <ty> <value> [syncscope("<target-scope>")] <ordering>                   ; yields ty
+      atomicrmw [volatile] <operation> <ty>* <pointer>, <ty> <value> [syncscope("<target-scope>")] <ordering>[, align <alignment>]  ; yields ty

 Overview:
 """""""""
@@ -9739,6 +9739,13 @@
 allowed to modify the number or order of execution of this
 ``atomicrmw`` with other :ref:`volatile operations <volatile>`.

+The instruction can take an optional ``align`` attribute.
+The alignment must be a power of two greater than or equal to the size of
+the ``<value>`` type. If unspecified, the alignment is assumed to be equal
+to the size of the ``<value>`` type. Note that this default alignment
+assumption differs from the alignment used for the load/store instructions
+when ``align`` isn't specified.
+
 A ``atomicrmw`` instruction can also take an optional
 ":ref:`syncscope <syncscope>`" argument.

@@ -9759,10 +9766,8 @@
 - xor: ``*ptr = *ptr ^ val``
 - max: ``*ptr = *ptr > val ? *ptr : val`` (using a signed comparison)
 - min: ``*ptr = *ptr < val ? *ptr : val`` (using a signed comparison)
-- umax: ``*ptr = *ptr > val ? *ptr : val`` (using an unsigned
-  comparison)
-- umin: ``*ptr = *ptr < val ? *ptr : val`` (using an unsigned
-  comparison)
+- umax: ``*ptr = *ptr > val ? *ptr : val`` (using an unsigned comparison)
+- umin: ``*ptr = *ptr < val ? *ptr : val`` (using an unsigned comparison)
 - fadd: ``*ptr = *ptr + val`` (using floating point arithmetic)
 - fsub: ``*ptr = *ptr - val`` (using floating point arithmetic)

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -7499,6 +7499,7 @@
   bool isVolatile = false;
   bool IsFP = false;
   AtomicRMWInst::BinOp Operation;
+  MaybeAlign Alignment;

   if (EatIfPresent(lltok::kw_volatile))
     isVolatile = true;
@@ -7531,7 +7532,8 @@
   if (parseTypeAndValue(Ptr, PtrLoc, PFS) ||
       parseToken(lltok::comma, "expected ',' after atomicrmw address") ||
       parseTypeAndValue(Val, ValLoc, PFS) ||
-      parseScopeAndOrdering(true /*Always atomic*/, SSID, Ordering))
+      parseScopeAndOrdering(true /*Always atomic*/, SSID, Ordering) ||
+      parseOptionalCommaAlign(Alignment, AteExtraComma))
     return true;

   if (Ordering == AtomicOrdering::Unordered)
@@ -7566,11 +7568,12 @@
   if (Size < 8 || (Size & (Size - 1)))
     return error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
                          " integer");
-  Align Alignment(
+  const Align DefaultAlignment(
       PFS.getFunction().getParent()->getDataLayout().getTypeStoreSize(
           Val->getType()));
   AtomicRMWInst *RMWI =
-      new AtomicRMWInst(Operation, Ptr, Val, Alignment, Ordering, SSID);
+      new AtomicRMWInst(Operation, Ptr, Val,
+                        Alignment.getValueOr(DefaultAlignment), Ordering, SSID);
   RMWI->setVolatile(isVolatile);
   Inst = RMWI;
   return AteExtraComma ? InstExtraComma : InstNormal;

diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -5131,29 +5131,55 @@
       break;
     }
     case bitc::FUNC_CODE_INST_ATOMICRMW: {
-      // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, ssid]
+      // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, ssid, align?]
+      const size_t NumRecords = Record.size();
       unsigned OpNum = 0;
-      Value *Ptr, *Val;
-      if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, &FullTy) ||
-          !isa<PointerType>(Ptr->getType()) ||
-          popValue(Record, OpNum, NextValueNo,
-                   getPointerElementFlatType(FullTy), Val) ||
-          OpNum + 4 != Record.size())
+
+      Value *Ptr = nullptr;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, &FullTy))
         return error("Invalid record");
-      AtomicRMWInst::BinOp Operation = getDecodedRMWOperation(Record[OpNum]);
+
+      if (!isa<PointerType>(Ptr->getType()))
+        return error("Invalid record");
+
+      Value *Val = nullptr;
+      if (popValue(Record, OpNum, NextValueNo,
+                   getPointerElementFlatType(FullTy), Val))
+        return error("Invalid record");
+
+      if (!(NumRecords == (OpNum + 4) || NumRecords == (OpNum + 5)))
+        return error("Invalid record");
+
+      const AtomicRMWInst::BinOp Operation =
+          getDecodedRMWOperation(Record[OpNum]);
       if (Operation < AtomicRMWInst::FIRST_BINOP ||
           Operation > AtomicRMWInst::LAST_BINOP)
         return error("Invalid record");
-      AtomicOrdering Ordering = getDecodedOrdering(Record[OpNum + 2]);
+
+      const bool IsVol = Record[OpNum + 1];
+
+      const AtomicOrdering Ordering = getDecodedOrdering(Record[OpNum + 2]);
       if (Ordering == AtomicOrdering::NotAtomic ||
           Ordering == AtomicOrdering::Unordered)
         return error("Invalid record");
-      SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]);
-      Align Alignment(
-          TheModule->getDataLayout().getTypeStoreSize(Val->getType()));
-      I = new AtomicRMWInst(Operation, Ptr, Val, Alignment, Ordering, SSID);
+
+      const SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]);
+
+      MaybeAlign Alignment;
+
+      if (NumRecords == (OpNum + 5)) {
+        if (Error Err = parseAlignmentValue(Record[OpNum + 4], Alignment))
+          return Err;
+      }
+
+      if (!Alignment)
+        Alignment =
+            Align(TheModule->getDataLayout().getTypeStoreSize(Val->getType()));
+
+      I = new AtomicRMWInst(Operation, Ptr, Val, *Alignment, Ordering, SSID);
       FullTy = getPointerElementFlatType(FullTy);
-      cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
+      cast<AtomicRMWInst>(I)->setVolatile(IsVol);
+
       InstructionList.push_back(I);
       break;
     }

diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -3071,6 +3071,7 @@
     Vals.push_back(getEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
     Vals.push_back(
         getEncodedSyncScopeID(cast<AtomicRMWInst>(I).getSyncScopeID()));
+    Vals.push_back(getEncodedAlign(cast<AtomicRMWInst>(I).getAlign()));
     break;
   case Instruction::Fence:
     Code = bitc::FUNC_CODE_INST_FENCE;

diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -4327,6 +4327,7 @@
   } else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&I)) {
     writeAtomic(RMWI->getContext(), RMWI->getOrdering(),
                 RMWI->getSyncScopeID());
+    Out << ", align " << RMWI->getAlign().value();
   } else if (const FenceInst *FI = dyn_cast<FenceInst>(&I)) {
     writeAtomic(FI->getContext(), FI->getOrdering(), FI->getSyncScopeID());
   } else if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(&I)) {

diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -734,28 +734,55 @@
   ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
   %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
   ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
-  %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
-  ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
-  %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
-  ; CHECK: %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
-  %atomicrmw.sub = atomicrmw sub i32* %word, i32 14 monotonic
-  ; CHECK: %atomicrmw.sub = atomicrmw sub i32* %word, i32 14 monotonic
-  %atomicrmw.and = atomicrmw and i32* %word, i32 15 monotonic
-  ; CHECK: %atomicrmw.and = atomicrmw and i32* %word, i32 15 monotonic
-  %atomicrmw.nand = atomicrmw nand i32* %word, i32 16 monotonic
-  ; CHECK: %atomicrmw.nand = atomicrmw nand i32* %word, i32 16 monotonic
-  %atomicrmw.or = atomicrmw or i32* %word, i32 17 monotonic
-  ; CHECK: %atomicrmw.or = atomicrmw or i32* %word, i32 17 monotonic
-  %atomicrmw.xor = atomicrmw xor i32* %word, i32 18 monotonic
-  ; CHECK: %atomicrmw.xor = atomicrmw xor i32* %word, i32 18 monotonic
-  %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
-  ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
-  %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
-  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
-  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
-  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+
+  ;; Atomic w/o alignment
+  %atomicrmw_no_align.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
+  ; CHECK: %atomicrmw_no_align.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
+  %atomicrmw_no_align.add = atomicrmw add i32* %word, i32 13 monotonic
+  ; CHECK: %atomicrmw_no_align.add = atomicrmw add i32* %word, i32 13 monotonic
+  %atomicrmw_no_align.sub = atomicrmw sub i32* %word, i32 14 monotonic
+  ; CHECK: %atomicrmw_no_align.sub = atomicrmw sub i32* %word, i32 14 monotonic
+  %atomicrmw_no_align.and = atomicrmw and i32* %word, i32 15 monotonic
+  ; CHECK: %atomicrmw_no_align.and = atomicrmw and i32* %word, i32 15 monotonic
+  %atomicrmw_no_align.nand = atomicrmw nand i32* %word, i32 16 monotonic
+  ; CHECK: %atomicrmw_no_align.nand = atomicrmw nand i32* %word, i32 16 monotonic
+  %atomicrmw_no_align.or = atomicrmw or i32* %word, i32 17 monotonic
+  ; CHECK: %atomicrmw_no_align.or = atomicrmw or i32* %word, i32 17 monotonic
+  %atomicrmw_no_align.xor = atomicrmw xor i32* %word, i32 18 monotonic
+  ; CHECK: %atomicrmw_no_align.xor = atomicrmw xor i32* %word, i32 18 monotonic
+  %atomicrmw_no_align.max = atomicrmw max i32* %word, i32 19 monotonic
+  ; CHECK: %atomicrmw_no_align.max = atomicrmw max i32* %word, i32 19 monotonic
+  %atomicrmw_no_align.min = atomicrmw volatile min i32* %word, i32 20 monotonic
+  ; CHECK: %atomicrmw_no_align.min = atomicrmw volatile min i32* %word, i32 20 monotonic
+  %atomicrmw_no_align.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw_no_align.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  %atomicrmw_no_align.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw_no_align.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+
+  ;; Atomic w/ alignment
+  %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic, align 16
+  ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic, align 16
+  %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic, align 16
+  ; CHECK: %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic, align 16
+  %atomicrmw.sub = atomicrmw sub i32* %word, i32 14 monotonic, align 16
+  ; CHECK: %atomicrmw.sub = atomicrmw sub i32* %word, i32 14 monotonic, align 16
+  %atomicrmw.and = atomicrmw and i32* %word, i32 15 monotonic, align 16
+  ; CHECK: %atomicrmw.and = atomicrmw and i32* %word, i32 15 monotonic, align 16
+  %atomicrmw.nand = atomicrmw nand i32* %word, i32 16 monotonic, align 16
+  ; CHECK: %atomicrmw.nand = atomicrmw nand i32* %word, i32 16 monotonic, align 16
+  %atomicrmw.or = atomicrmw or i32* %word, i32 17 monotonic, align 16
+  ; CHECK: %atomicrmw.or = atomicrmw or i32* %word, i32 17 monotonic, align 16
+  %atomicrmw.xor = atomicrmw xor i32* %word, i32 18 monotonic, align 16
+  ; CHECK: %atomicrmw.xor = atomicrmw xor i32* %word, i32 18 monotonic, align 16
+  %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic, align 16
+  ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic, align 16
+  %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic, align 16
+  ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic, align 16
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic, align 16
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic, align 16
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic, align 16
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic, align 16
+
   fence acquire
   ; CHECK: fence acquire
   fence release

diff --git a/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll b/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll
--- a/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll
+++ b/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll
@@ -4,7 +4,7 @@

 ; CHECK-LABEL: void @empty()
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: %0 = atomicrmw add i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__llvm_gcov_ctr, i64 0, i64 0), i64 1 monotonic, !dbg [[DBG:![0-9]+]]
+; CHECK-NEXT: %0 = atomicrmw add i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__llvm_gcov_ctr, i64 0, i64 0), i64 1 monotonic, align 8, !dbg [[DBG:![0-9]+]]
 ; CHECK-NEXT: ret void, !dbg [[DBG]]

 define dso_local void @empty() !dbg !5 {
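
Editor's note, not part of the patch: the standalone IR sketch below (with a
made-up function @rmw_example) illustrates both spellings the patched parser
accepts. When align is omitted, the parser falls back to the store size of
the value operand (4 bytes for i32), and the updated AsmWriter then prints
the alignment back explicitly; an explicit alignment must be a power of two
at least as large as the value type.

    define i32 @rmw_example(i32* %p) {
      ; No explicit alignment: defaults to the i32 store size, so this
      ; round-trips through the printer as "monotonic, align 4".
      %old.default = atomicrmw add i32* %p, i32 1 monotonic
      ; Explicit over-alignment: any power of two >= 4 is accepted here.
      %old.aligned = atomicrmw xchg i32* %p, i32 2 seq_cst, align 16
      ret i32 %old.default
    }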