Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2985,6 +2985,10 @@
-      // Only the first lane is executes, so readfirstlane is safe.
+      // Only the first lane executes, so readfirstlane is safe.
       substituteSimpleCopyRegs(OpdMapper, 1);
       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+      if (Subtarget.needsAlignedVGPRs()) {
+        Register DataReg = MI.getOperand(1).getReg();
+        constrainGenericRegister(DataReg, AMDGPU::VReg_32_Align2RegClass, MRI);
+      }
       return;
     }
     case Intrinsic::amdgcn_ds_gws_sema_v:
Index: llvm/lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/DSInstructions.td
+++ llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -373,6 +373,7 @@
 
   let has_gws_data0 = 1;
   let hasSideEffects = 1;
+  let hasPostISelHook = 1;
 }
 
 class DS_VOID <string opName> : DS_Pseudo<opName,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11420,6 +11420,23 @@
 
   if (TII->isMIMG(MI) && !MI.mayStore())
     AddIMGInit(MI);
+
+  if (Subtarget->needsAlignedVGPRs()) {
+    switch (MI.getOpcode()) {
+    default:
+      break;
+    case AMDGPU::DS_GWS_INIT:
+    case AMDGPU::DS_GWS_SEMA_BR:
+    case AMDGPU::DS_GWS_BARRIER: {
+      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+      Register Reg = TII->getNamedOperand(MI, AMDGPU::OpName::data0)->getReg();
+      MRI.setRegClass(Reg, TRI->isAGPR(MRI, Reg)
+                               ? &AMDGPU::AReg_32_Align2RegClass
+                               : &AMDGPU::VReg_32_Align2RegClass);
+      break;
+    }
+    }
+  }
 }
 
 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4675,7 +4675,21 @@
     assert(DefinedRC);
     if (!isLegalRegOperand(MRI, OpInfo, *MO))
       return false;
-    bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
+    const MachineOperand &CurOp = MI.getOperand(OpIdx);
+    const auto *RC = RI.getRegClassForReg(MRI, MO->getReg());
+    if (ST.needsAlignedVGPRs() && CurOp.isReg() && &CurOp != MO) {
+      const auto *CurRC = RI.getRegClassForReg(MRI, CurOp.getReg());
+      if (RI.isAlignedRC(CurRC) && RI.hasVectorRegisters(RC)) {
+        unsigned Sub = MO->getSubReg();
+        if (Sub &&
+            !RI.getCompatibleSubRegClass(RI.getEquivalentVGPRClass(RC),
+                                         RI.getEquivalentVGPRClass(CurRC), Sub))
+          return false;
+        if (!Sub && !RI.isAlignedRC(RC))
+          return false;
+      }
+    }
+    bool IsAGPR = RI.hasAGPRs(RC);
     if (IsAGPR && !ST.hasMAIInsts())
       return false;
     unsigned Opc = MI.getOpcode();
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -143,6 +143,9 @@
   LLVM_READONLY
   static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
 
+  LLVM_READONLY
+  bool isAlignedRC(const TargetRegisterClass *RC) const;
+
   /// Return the 'base' register class for this register.
   /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
   const TargetRegisterClass *getPhysRegClass(MCRegister Reg) const;
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1886,6 +1886,8 @@
 
 static const TargetRegisterClass *
 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
+  if (BitWidth <= 32)
+    return &AMDGPU::VReg_32_Align2RegClass;
   if (BitWidth <= 64)
     return &AMDGPU::VReg_64_Align2RegClass;
   if (BitWidth <= 96)
@@ -1942,6 +1944,8 @@
 
 static const TargetRegisterClass *
 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
+  if (BitWidth <= 32)
+    return &AMDGPU::AReg_32_Align2RegClass;
   if (BitWidth <= 64)
     return &AMDGPU::AReg_64_Align2RegClass;
   if (BitWidth <= 96)
@@ -1998,6 +2002,35 @@
   return nullptr;
 }
 
+bool SIRegisterInfo::isAlignedRC(const TargetRegisterClass *RC) const {
+  if (!RC)
+    return false;
+
+  switch (RC->getID()) {
+  default:
+    return false;
+  case AMDGPU::VReg_32_Align2RegClassID:
+  case AMDGPU::VReg_64_Align2RegClassID:
+  case AMDGPU::VReg_96_Align2RegClassID:
+  case AMDGPU::VReg_128_Align2RegClassID:
+  case AMDGPU::VReg_160_Align2RegClassID:
+  case AMDGPU::VReg_192_Align2RegClassID:
+  case AMDGPU::VReg_256_Align2RegClassID:
+  case AMDGPU::VReg_512_Align2RegClassID:
+  case AMDGPU::VReg_1024_Align2RegClassID:
+  case AMDGPU::AReg_32_Align2RegClassID:
+  case AMDGPU::AReg_64_Align2RegClassID:
+  case AMDGPU::AReg_96_Align2RegClassID:
+  case AMDGPU::AReg_128_Align2RegClassID:
+  case AMDGPU::AReg_160_Align2RegClassID:
+  case AMDGPU::AReg_192_Align2RegClassID:
+  case AMDGPU::AReg_256_Align2RegClassID:
+  case AMDGPU::AReg_512_Align2RegClassID:
+  case AMDGPU::AReg_1024_Align2RegClassID:
+    return true;
+  }
+}
+
 // FIXME: This is very slow. It might be worth creating a map from physreg to
 // register class.
 const TargetRegisterClass *
@@ -2011,6 +2044,8 @@
     &AMDGPU::SReg_32RegClass,
     &AMDGPU::AGPR_32RegClass,
     &AMDGPU::AGPR_32RegClass,
+    &AMDGPU::VReg_32_Align2RegClass,
+    &AMDGPU::AReg_32_Align2RegClass,
     &AMDGPU::VReg_64_Align2RegClass,
     &AMDGPU::VReg_64RegClass,
     &AMDGPU::SReg_64RegClass,
@@ -2105,7 +2140,9 @@
 const TargetRegisterClass *
 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
   unsigned Size = getRegSizeInBits(*SRC);
-  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
+  const TargetRegisterClass *VRC = isAlignedRC(SRC)
+                                       ? getAlignedVGPRClassForBitWidth(Size)
+                                       : getVGPRClassForBitWidth(Size);
   assert(VRC && "Invalid register class size");
   return VRC;
 }
@@ -2113,7 +2150,9 @@
 const TargetRegisterClass *
 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
   unsigned Size = getRegSizeInBits(*SRC);
-  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
+  const TargetRegisterClass *ARC = isAlignedRC(SRC)
+                                       ? getAlignedAGPRClassForBitWidth(Size)
+                                       : getAGPRClassForBitWidth(Size);
   assert(ARC && "Invalid register class size");
   return ARC;
 }
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -806,6 +806,8 @@
   def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
 }
 
+def VReg_32_Align2 : VRegClassBase<1, [OtherVT], (decimate VGPR_32, 2)>;
+
 defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
                                 (add VGPR_64)>;
 defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
@@ -827,6 +829,8 @@
   }
 }
 
+def AReg_32_Align2 : VRegClassBase<1, [OtherVT], (decimate AGPR_32, 2)>;
+
 defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
                         (add AGPR_64)>;
 defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1604,6 +1604,8 @@
   case AMDGPU::SReg_32RegClassID:
   case AMDGPU::SReg_32_XM0RegClassID:
   case AMDGPU::SRegOrLds_32RegClassID:
+  case AMDGPU::VReg_32_Align2RegClassID:
+  case AMDGPU::AReg_32_Align2RegClassID:
     return 32;
   case AMDGPU::SGPR_64RegClassID:
   case AMDGPU::VS_64RegClassID:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -138,7 +138,7 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2
+  ; CHECK:   INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 3014666 /* regdef:VReg_64 */, def %2
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s64) = COPY %2
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
Index: llvm/test/CodeGen/AMDGPU/ds_gws_align.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/ds_gws_align.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+
+; GCN-LABEL: {{^}}gws_init_odd_reg:
+; GFX908-DAG: ds_gws_init v1 gds
+; GFX90A-DAG: ds_gws_init v2 gds
+; GCN-DAG:    ds_gws_init v0 gds
+define amdgpu_ps void @gws_init_odd_reg(<2 x i32> %arg) {
+  %vgpr.0 = extractelement <2 x i32> %arg, i32 0
+  %vgpr.1 = extractelement <2 x i32> %arg, i32 1
+  call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.0, i32 0)
+  call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.1, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_sema_br_odd_reg:
+; GFX908-DAG: ds_gws_sema_br v1 gds
+; GFX90A-DAG: ds_gws_sema_br v2 gds
+; GCN-DAG:    ds_gws_sema_br v0 gds
+define amdgpu_ps void @gws_sema_br_odd_reg(<2 x i32> %arg) {
+  %vgpr.0 = extractelement <2 x i32> %arg, i32 0
+  %vgpr.1 = extractelement <2 x i32> %arg, i32 1
+  call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.0, i32 0)
+  call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.1, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_barrier_odd_reg:
+; GFX908-DAG: ds_gws_barrier v1 gds
+; GFX90A-DAG: ds_gws_barrier v2 gds
+; GCN-DAG:    ds_gws_barrier v0 gds
+define amdgpu_ps void @gws_barrier_odd_reg(<2 x i32> %arg) {
+  %vgpr.0 = extractelement <2 x i32> %arg, i32 0
+  %vgpr.1 = extractelement <2 x i32> %arg, i32 1
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.0, i32 0)
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.1, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_init_odd_agpr:
+; GFX908-COUNT-2: ds_gws_init v{{[0-9]+}} gds
+; GFX90A:         ds_gws_init a{{[0-9]*[02468]}} gds
+; GFX90A:         ds_gws_init v{{[0-9]*[02468]}} gds
+define amdgpu_ps void @gws_init_odd_agpr(<4 x i32> %arg) {
+bb:
+  %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
+  %agpr.0 = extractelement <4 x i32> %mai, i32 0
+  %agpr.1 = extractelement <4 x i32> %mai, i32 1
+  call void @llvm.amdgcn.ds.gws.init(i32 %agpr.0, i32 0)
+  call void @llvm.amdgcn.ds.gws.init(i32 %agpr.1, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.init(i32, i32)
+declare void @llvm.amdgcn.ds.gws.sema.br(i32, i32)
+declare void @llvm.amdgcn.ds.gws.barrier(i32, i32)
+declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
Index: llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4
+  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5046282 /* regdef:SGPR_128 */, def %4
   ; GFX908:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4
+  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5046282 /* regdef:SGPR_128 */, def %4
   ; GFX90A:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -26,15 +26,15 @@
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4390922 /* regdef:VReg_128 */, def %4
+  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4653066 /* regdef:VReg_128 */, def %4
   ; GFX908:   [[COPY:%[0-9]+]]:vreg_128 = COPY %4
-  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4390921 /* reguse:VReg_128 */, [[COPY]]
+  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4653065 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4521994 /* regdef:VReg_128_Align2 */, def %4
+  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:VReg_128_Align2 */, def %4
   ; GFX90A:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
-  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VReg_128_Align2 */, [[COPY]]
+  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -44,15 +44,15 @@
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4325386 /* regdef:AReg_128 */, def %4
+  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4587530 /* regdef:AReg_128 */, def %4
   ; GFX908:   [[COPY:%[0-9]+]]:areg_128 = COPY %4
-  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_128 */, [[COPY]]
+  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4456458 /* regdef:AReg_128_Align2 */, def %4
+  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4718602 /* regdef:AReg_128_Align2 */, def %4
   ; GFX90A:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
-  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4456457 /* reguse:AReg_128_Align2 */, [[COPY]]
+  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4718601 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)