diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -58,6 +58,7 @@
     /// of the extension
     AssertSext, AssertZext,
+    AssertAlign,
 
     /// Various leaf nodes.
     BasicBlock,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1342,6 +1342,9 @@
   /// Return a freeze using the SDLoc of the value operand.
   SDValue getFreeze(SDValue V);
 
+  /// Return an AssertAlignSDNode.
+  SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A);
+
   /// Return the specified value casted to
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -2526,6 +2526,22 @@
   }
 };
 
+/// An SDNode that records if a register contains a value that is guaranteed to
+/// be aligned accordingly.
+class AssertAlignSDNode : public SDNode {
+  Align Alignment;
+
+public:
+  AssertAlignSDNode(unsigned Order, const DebugLoc &DL, EVT VT, Align A)
+      : SDNode(ISD::AssertAlign, Order, DL, getSDVTList(VT)), Alignment(A) {}
+
+  Align getAlign() const { return Alignment; }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::AssertAlign;
+  }
+};
+
 class SDNodeIterator : public std::iterator<std::forward_iterator_tag,
                                             SDNode, ptrdiff_t> {
   const SDNode *Node;
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -667,10 +667,11 @@
 def intrinsic_wo_chain : SDNode<"ISD::INTRINSIC_WO_CHAIN",
                                 SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>, []>;
 
-def SDT_assertext : SDTypeProfile<1, 1,
+def SDT_assert : SDTypeProfile<1, 1,
   [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>;
-def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>;
-def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>;
+def assertsext : SDNode<"ISD::AssertSext", SDT_assert>;
+def assertzext : SDNode<"ISD::AssertZext", SDT_assert>;
+def assertalign : SDNode<"ISD::AssertAlign", SDT_assert>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -455,6 +455,7 @@
     SDValue visitZERO_EXTEND(SDNode *N);
     SDValue visitANY_EXTEND(SDNode *N);
     SDValue visitAssertExt(SDNode *N);
+    SDValue visitAssertAlign(SDNode *N);
     SDValue visitSIGN_EXTEND_INREG(SDNode *N);
     SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
     SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
@@ -1601,6 +1602,7 @@
   case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
   case ISD::AssertSext:
   case ISD::AssertZext:         return visitAssertExt(N);
+  case ISD::AssertAlign:        return visitAssertAlign(N);
   case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
   case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
   case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -10696,6 +10698,45 @@
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
+  SDLoc DL(N);
+
+  Align AL = cast<AssertAlignSDNode>(N)->getAlign();
+  SDValue N0 = N->getOperand(0);
+
+  // Fold (assertalign (assertalign x, AL0), AL1) ->
+  // (assertalign x, max(AL0, AL1))
+  if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
+    return DAG.getAssertAlign(DL, N0.getOperand(0),
+                              std::max(AL, AAN->getAlign()));
+
+  // In rare cases, there are trivial arithmetic ops in source operands. Sink
+  // this assert down to source operands so that those arithmetic ops could be
+  // exposed to the DAG combining.
+  switch (N0.getOpcode()) {
+  default:
+    break;
+  case ISD::ADD:
+  case ISD::SUB: {
+    unsigned AlignShift = Log2(AL);
+    SDValue LHS = N0.getOperand(0);
+    SDValue RHS = N0.getOperand(1);
+    unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
+    unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
+    if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
+      if (LHSAlignShift < AlignShift)
+        LHS = DAG.getAssertAlign(DL, LHS, AL);
+      if (RHSAlignShift < AlignShift)
+        RHS = DAG.getAssertAlign(DL, RHS, AL);
+      return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
+    }
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// If the result of a wider load is shifted to right of N bits and then
 /// truncated to a narrower type and where N is a multiple of number of bits of
 /// the narrower type, transform it to a narrower load from address + N / num of
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3166,6 +3166,15 @@
     Known.One &= (~Known.Zero);
     break;
   }
+  case ISD::AssertAlign: {
+    unsigned LogOfAlign = Log2(cast<AssertAlignSDNode>(Op)->getAlign());
+    assert(LogOfAlign != 0);
+    // If a node is guaranteed to be aligned, set low zero bits accordingly as
+    // well as clearing one bits.
+    Known.Zero.setLowBits(LogOfAlign);
+    Known.One.clearLowBits(LogOfAlign);
+    break;
+  }
   case ISD::FGETSIGN:
     // All bits are zero except the low bit.
     Known.Zero.setBitsFrom(1);
@@ -5186,6 +5195,34 @@
   return SDValue();
 }
 
+SDValue SelectionDAG::getAssertAlign(const SDLoc &DL, SDValue Val, Align A) {
+  assert(Val.getValueType().isInteger() && "Invalid AssertAlign!");
+
+  // There's no need to assert on a byte-aligned pointer. All pointers are at
+  // least byte aligned.
+  if (A == Align(1))
+    return Val;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::AssertAlign, getVTList(Val.getValueType()), {Val});
+  ID.AddInteger(A.value());
+
+  void *IP = nullptr;
+  if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+    return SDValue(E, 0);
+
+  auto *N = newSDNode<AssertAlignSDNode>(DL.getIROrder(), DL.getDebugLoc(),
+                                         Val.getValueType(), A);
+  createOperands(N, {Val});
+
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
+}
+
 SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                               SDValue N1, SDValue N2, const SDNodeFlags Flags) {
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -135,6 +135,11 @@
 /// some float libcalls (6, 8 or 12 bits).
 static unsigned LimitFloatPrecision;
 
+static cl::opt<bool>
+    InsertAssertAlign("insert-assert-align", cl::init(true),
+                      cl::desc("Insert the experimental `assertalign` node."),
+                      cl::ReallyHidden);
+
 static cl::opt<unsigned, true>
     LimitFPPrecision("limit-float-precision",
                      cl::desc("Generate low-precision inline sequences "
@@ -4747,6 +4752,15 @@
     } else
       Result = lowerRangeToAssertZExt(DAG, I, Result);
 
+    MaybeAlign Alignment = I.getRetAlign();
+    if (!Alignment)
+      Alignment = F->getAttributes().getRetAlignment();
+    // Insert `assertalign` node if there's an alignment.
+    if (InsertAssertAlign && Alignment) {
+      Result =
+          DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
+    }
+
     setValue(&I, Result);
   }
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -106,6 +106,7 @@
   case ISD::TokenFactor:                return "TokenFactor";
   case ISD::AssertSext:                 return "AssertSext";
   case ISD::AssertZext:                 return "AssertZext";
+  case ISD::AssertAlign:                return "AssertAlign";
 
   case ISD::BasicBlock:                 return "BasicBlock";
   case ISD::VALUETYPE:                  return "ValueType";
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2820,6 +2820,7 @@
     return;
   case ISD::AssertSext:
   case ISD::AssertZext:
+  case ISD::AssertAlign:
     ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
     CurDAG->RemoveDeadNode(NodeToMatch);
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1628,6 +1628,37 @@
   llvm_unreachable("cannot find MemSDNode in the pattern!");
 }
 
+static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
+                                          SDValue &N0, SDValue &N1) {
+  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
+      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    // As we split 64-bit `or` earlier, it's a complicated pattern to match,
+    // i.e.
+    // (i64 (bitcast (v2i32 (build_vector
+    //                        (or (extract_vector_elt V, 0), OFFSET),
+    //                        (extract_vector_elt V, 1)))))
+    SDValue Lo = Addr.getOperand(0).getOperand(0);
+    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
+      SDValue BaseLo = Lo.getOperand(0);
+      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
+      // Check that split base (Lo and Hi) are extracted from the same one.
+      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
+          // Lo is statically extracted from index 0.
+          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
+          BaseLo.getConstantOperandVal(1) == 0 &&
+          // Hi is statically extracted from index 1.
+          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
+          BaseHi.getConstantOperandVal(1) == 1) {
+        N0 = BaseLo.getOperand(0).getOperand(0);
+        N1 = Lo.getOperand(1);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 template <bool IsSigned>
 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
                                           SDValue Addr,
@@ -1638,84 +1669,91 @@
   if (Subtarget->hasFlatInstOffsets() &&
       (!Subtarget->hasFlatSegmentOffsetBug() ||
-       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
-      CurDAG->isBaseWithConstantOffset(Addr)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
-    const SIInstrInfo *TII = Subtarget->getInstrInfo();
-    unsigned AS = findMemSDNode(N)->getAddressSpace();
-    if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
-      Addr = N0;
-      OffsetVal = COffsetVal;
-    } else {
-      // If the offset doesn't fit, put the low bits into the offset field and
-      // add the rest.
-
-      SDLoc DL(N);
-      uint64_t ImmField;
-      const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
-      if (IsSigned) {
-        ImmField = SignExtend64(COffsetVal, NumBits);
-
-        // Don't use a negative offset field if the base offset is positive.
-        // Since the scheduler currently relies on the offset field, doing so
-        // could result in strange scheduling decisions.
-
-        // TODO: Should we not do this in the opposite direction as well?
-        if (static_cast<int64_t>(COffsetVal) > 0) {
-          if (static_cast<int64_t>(ImmField) < 0) {
-            const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
-            ImmField = COffsetVal & OffsetMask;
+       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+    SDValue N0, N1;
+    if (CurDAG->isBaseWithConstantOffset(Addr)) {
+      N0 = Addr.getOperand(0);
+      N1 = Addr.getOperand(1);
+    } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+      assert(N0 && N1 && isa<ConstantSDNode>(N1));
+    }
+    if (N0 && N1) {
+      uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+      const SIInstrInfo *TII = Subtarget->getInstrInfo();
+      unsigned AS = findMemSDNode(N)->getAddressSpace();
+      if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+        Addr = N0;
+        OffsetVal = COffsetVal;
+      } else {
+        // If the offset doesn't fit, put the low bits into the offset field and
+        // add the rest.
+
+        SDLoc DL(N);
+        uint64_t ImmField;
+        const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+        if (IsSigned) {
+          ImmField = SignExtend64(COffsetVal, NumBits);
+
+          // Don't use a negative offset field if the base offset is positive.
+          // Since the scheduler currently relies on the offset field, doing so
+          // could result in strange scheduling decisions.
+
+          // TODO: Should we not do this in the opposite direction as well?
+          if (static_cast<int64_t>(COffsetVal) > 0) {
+            if (static_cast<int64_t>(ImmField) < 0) {
+              const uint64_t OffsetMask =
+                  maskTrailingOnes<uint64_t>(NumBits - 1);
+              ImmField = COffsetVal & OffsetMask;
+            }
           }
+        } else {
+          // TODO: Should we do this for a negative offset?
+          const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+          ImmField = COffsetVal & OffsetMask;
         }
-      } else {
-        // TODO: Should we do this for a negative offset?
-        const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
-        ImmField = COffsetVal & OffsetMask;
-      }
 
-      uint64_t RemainderOffset = COffsetVal - ImmField;
+        uint64_t RemainderOffset = COffsetVal - ImmField;
 
-      assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
-      assert(RemainderOffset + ImmField == COffsetVal);
+        assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+        assert(RemainderOffset + ImmField == COffsetVal);
 
-      OffsetVal = ImmField;
+        OffsetVal = ImmField;
 
-      // TODO: Should this try to use a scalar add pseudo if the base address is
-      // uniform and saddr is usable?
-      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+        // TODO: Should this try to use a scalar add pseudo if the base address
+        // is uniform and saddr is usable?
+        SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+        SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
 
-      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                            DL, MVT::i32, N0, Sub0);
-      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                            DL, MVT::i32, N0, Sub1);
+        SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                              MVT::i32, N0, Sub0);
+        SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                              MVT::i32, N0, Sub1);
 
-      SDValue AddOffsetLo
-        = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue AddOffsetHi
-        = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+        SDValue AddOffsetLo =
+            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+        SDValue AddOffsetHi =
+            getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
 
-      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
 
-      SDNode *Add = CurDAG->getMachineNode(
-        AMDGPU::V_ADD_I32_e64, DL, VTs,
-        {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+        SDNode *Add =
+            CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs,
+                                   {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
 
-      SDNode *Addc = CurDAG->getMachineNode(
-        AMDGPU::V_ADDC_U32_e64, DL, VTs,
-        {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+        SDNode *Addc = CurDAG->getMachineNode(
+            AMDGPU::V_ADDC_U32_e64, DL, VTs,
+            {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
 
-      SDValue RegSequenceArgs[] = {
-        CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-        SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
-      };
+        SDValue RegSequenceArgs[] = {
+            CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+            SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
 
-      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                            MVT::i64, RegSequenceArgs), 0);
+        Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                              MVT::i64, RegSequenceArgs),
+                       0);
+      }
     }
   }
@@ -1824,15 +1862,21 @@
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
   // wraparound, because s_load instructions perform the addition in 64 bits.
   if ((Addr.getValueType() != MVT::i32 ||
-       Addr->getFlags().hasNoUnsignedWrap()) &&
-      (CurDAG->isBaseWithConstantOffset(Addr) ||
-       Addr.getOpcode() == ISD::ADD)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-
-    if (SelectSMRDOffset(N1, Offset, Imm)) {
-      SBase = Expand32BitAddress(N0);
-      return true;
+       Addr->getFlags().hasNoUnsignedWrap())) {
+    SDValue N0, N1;
+    // Extract the base and offset if possible.
+    if (CurDAG->isBaseWithConstantOffset(Addr) ||
+        Addr.getOpcode() == ISD::ADD) {
+      N0 = Addr.getOperand(0);
+      N1 = Addr.getOperand(1);
+    } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+      assert(N0 && N1 && isa<ConstantSDNode>(N1));
+    }
+    if (N0 && N1) {
+      if (SelectSMRDOffset(N1, Offset, Imm)) {
+        SBase = Expand32BitAddress(N0);
+        return true;
+      }
     }
   }
   SBase = Expand32BitAddress(Addr);
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -30,10 +30,9 @@
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
-; HAWAII-NEXT:    s_add_u32 s0, s4, 14
-; HAWAII-NEXT:    s_addc_u32 s1, s5, 0
+; HAWAII-NEXT:    s_or_b32 s0, s4, 14
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s5
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
 ; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
@@ -52,27 +51,26 @@
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
+; FIJI-NEXT:    s_or_b32 s0, s4, 14
+; FIJI-NEXT:    v_mov_b32_e32 v0, s0
+; FIJI-NEXT:    v_mov_b32_e32 v1, s5
+; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; FIJI-NEXT:    s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT:    s_load_dword s2, s[4:5], 0x8
-; FIJI-NEXT:    s_load_dword s1, s[4:5], 0xc
+; FIJI-NEXT:    s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; FIJI-NEXT:    s_mov_b32 m0, -1
 ; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
-; FIJI-NEXT:    v_mov_b32_e32 v2, s0
-; FIJI-NEXT:    s_and_b32 s3, s1, 0xffff
-; FIJI-NEXT:    s_add_u32 s0, s4, 14
+; FIJI-NEXT:    v_mov_b32_e32 v1, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v3, s1
-; FIJI-NEXT:    s_addc_u32 s1, s5, 0
-; FIJI-NEXT:    v_mov_b32_e32 v0, s0
-; FIJI-NEXT:    v_mov_b32_e32 v1, s1
-; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT:    ds_write_b16 v2, v3 offset:4
-; FIJI-NEXT:    v_mov_b32_e32 v3, s2
-; FIJI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; FIJI-NEXT:    s_and_b32 s3, s2, 0xffff
+; FIJI-NEXT:    v_mov_b32_e32 v2, s2
+; FIJI-NEXT:    ds_write_b16 v1, v2 offset:4
+; FIJI-NEXT:    s_waitcnt vmcnt(0)
 ; FIJI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; FIJI-NEXT:    v_or_b32_e32 v0, s3, v0
 ; FIJI-NEXT:    v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT:    ds_write_b8 v2, v0 offset:6
-; FIJI-NEXT:    ds_write_b32 v2, v3
+; FIJI-NEXT:    ds_write_b8 v1, v0 offset:6
+; FIJI-NEXT:    ds_write_b32 v1, v3
 ; FIJI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: local_store_i55:
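
Note (illustration only, not part of the patch): the sketch below shows one way target or combiner code could consume the new node. It uses only interfaces introduced in this diff (ISD::AssertAlign, AssertAlignSDNode::getAlign(), SelectionDAG::getAssertAlign()) together with existing SDValue/dyn_cast machinery; the helper name getAssertedAlign is hypothetical.

  // Hypothetical helper (sketch only; assumes the SelectionDAGNodes.h changes
  // above are applied): recover the strongest alignment asserted on a value by
  // looking through stacked AssertAlign nodes.
  static Align getAssertedAlign(SDValue V) {
    Align Best(1);
    while (auto *AA = dyn_cast<AssertAlignSDNode>(V.getNode())) {
      Best = std::max(Best, AA->getAlign());
      V = AA->getOperand(0);
    }
    return Best;
  }

  // Creating the node, e.g. for an integer value known to be 16-byte aligned:
  //   SDValue Asserted = DAG.getAssertAlign(DL, Val, Align(16));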