diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1277,6 +1277,8 @@
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
+
+  setMaxAtomicSizeInBitsSupported(128);
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -16394,31 +16396,36 @@
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
-// Loads and stores less than 128-bits are already atomic; ones above that
-// are doomed anyway, so defer to the default libcall and blame the OS when
-// things go wrong.
+// Atomic stores narrower than 128 bits are supported directly; a 128-bit store
+// can only be done via an ldaxp/stlxp sequence, so it must be expanded in IR.
 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
   return Size == 128;
 }
 
-// Loads and stores less than 128-bits are already atomic; ones above that
-// are doomed anyway, so defer to the default libcall and blame the OS when
-// things go wrong.
+// Atomic loads narrower than 128 bits are supported directly; a 128-bit load
+// can only be done via an ldaxp/stlxp sequence, so it is expanded as LL/SC.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
   return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
 }
 
-// For the real atomic operations, we have ldxr/stxr up to 128 bits,
+// The "default" for integer RMW operations is to expand to an LL/SC loop.
+// However, with the LSE instructions (or outline-atomics mode, which provides
+// library routines in place of the LSE instructions), we can directly emit many
+// operations instead.
+//
+// Floating-point operations are always expanded to a cmpxchg loop, because they
+// may trigger a trap which aborts an LL/SC sequence.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   if (AI->isFloatingPointOperation())
     return AtomicExpansionKind::CmpXChg;
 
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
-  if (Size > 128) return AtomicExpansionKind::None;
+  assert(Size <= 128 && "AtomicExpandPass should've handled large sizes.");
+
   // Nand not supported in LSE.
   if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
   // Leave 128 bits to LLSC.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
-; RUN: llc -O3 -aarch64-enable-atomic-cfg-tidy=0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=O3
+; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+; RUN: llc -O3 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=O3
 
 ; This file checks that the translation from llvm IR to generic MachineInstr
 ; is correct.
@@ -2048,190 +2048,126 @@
 }
 
 ; Try a monotonic atomicrmw xchg
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_xchg(i256* %addr) {
+define i32 @test_atomicrmw_xchg(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_xchg
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_XCHG [[ADDR]](p0), [[VAL]] :: (load store monotonic 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw xchg i256* %addr, i256 1 monotonic
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_XCHG [[ADDR]](p0), [[VAL]] :: (load store monotonic 4 on %ir.addr)
+  %oldval = atomicrmw xchg i32* %addr, i32 1 monotonic
+  ret i32 %oldval
 }
 
 ; Try an acquire atomicrmw add
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_add(i256* %addr) {
+define i32 @test_atomicrmw_add(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_add
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_ADD [[ADDR]](p0), [[VAL]] :: (load store acquire 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw add i256* %addr, i256 1 acquire
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_ADD [[ADDR]](p0), [[VAL]] :: (load store acquire 4 on %ir.addr)
+  %oldval = atomicrmw add i32* %addr, i32 1 acquire
+  ret i32 %oldval
 }
 
 ; Try a release atomicrmw sub
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_sub(i256* %addr) {
+define i32 @test_atomicrmw_sub(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_sub
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_SUB [[ADDR]](p0), [[VAL]] :: (load store release 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw sub i256* %addr, i256 1 release
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_SUB [[ADDR]](p0), [[VAL]] :: (load store release 4 on %ir.addr)
+  %oldval = atomicrmw sub i32* %addr, i32 1 release
+  ret i32 %oldval
 }
 
 ; Try an acq_rel atomicrmw and
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_and(i256* %addr) {
+define i32 @test_atomicrmw_and(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_and
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_AND [[ADDR]](p0), [[VAL]] :: (load store acq_rel 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw and i256* %addr, i256 1 acq_rel
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
-}
-
-; Try an seq_cst atomicrmw nand
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_nand(i256* %addr) {
-; CHECK-LABEL: name: test_atomicrmw_nand
-; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
-; CHECK-NEXT:  liveins: $x0
-; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_NAND [[ADDR]](p0), [[VAL]] :: (load store seq_cst 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw nand i256* %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_AND [[ADDR]](p0), [[VAL]] :: (load store acq_rel 4 on %ir.addr)
+  %oldval = atomicrmw and i32* %addr, i32 1 acq_rel
+  ret i32 %oldval
 }
 
+;; An atomicrmw nand is never passed to GlobalISel by the AArch64 target.
+;; It's always expanded in IR.
+
 ; Try an seq_cst atomicrmw or
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_or(i256* %addr) {
+define i32 @test_atomicrmw_or(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_or
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_OR [[ADDR]](p0), [[VAL]] :: (load store seq_cst 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw or i256* %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_OR [[ADDR]](p0), [[VAL]] :: (load store seq_cst 4 on %ir.addr)
+  %oldval = atomicrmw or i32* %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw xor
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_xor(i256* %addr) {
+define i32 @test_atomicrmw_xor(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_xor
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_XOR [[ADDR]](p0), [[VAL]] :: (load store seq_cst 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw xor i256* %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_XOR [[ADDR]](p0), [[VAL]] :: (load store seq_cst 4 on %ir.addr)
+  %oldval = atomicrmw xor i32* %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw min
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_min(i256* %addr) {
+define i32 @test_atomicrmw_min(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_min
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_MIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw min i256* %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_MIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst 4 on %ir.addr)
+  %oldval = atomicrmw min i32* %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw max
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_max(i256* %addr) {
+define i32 @test_atomicrmw_max(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_max
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_MAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw max i256* %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_MAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst 4 on %ir.addr)
+  %oldval = atomicrmw max i32* %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw unsigned min
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_umin(i256* %addr) {
+define i32 @test_atomicrmw_umin(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_umin
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_UMIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw umin i256* %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_UMIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst 4 on %ir.addr)
+  %oldval = atomicrmw umin i32* %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw unsigned max
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
-define i32 @test_atomicrmw_umax(i256* %addr) {
+define i32 @test_atomicrmw_umax(i32* %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_umax
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_UMAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst 32 on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw umax i256* %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_UMAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst 4 on %ir.addr)
+  %oldval = atomicrmw umax i32* %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 @addr = global i8* null