Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -254,6 +254,9 @@
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  bool isMemOpUniform(const SDNode *N) const;
+  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
 };
 
 namespace AMDGPUISD {
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2480,9 +2480,9 @@
 
   unsigned Size = VT.getStoreSize();
   unsigned Align = LN->getAlignment();
+  unsigned AS = LN->getAddressSpace();
   if (Align < Size && isTypeLegal(VT)) {
     bool IsFast;
-    unsigned AS = LN->getAddressSpace();
 
     // Expand unaligned loads earlier than legalization. Due to visitation order
     // problems during legalization, the emitted instructions to pack and unpack
@@ -2500,6 +2500,77 @@
     return SDValue();
   }
 
+  // Create DWORDX3 loads. We cannot create them later because the legalizer
+  // will split them and there is no way to specify custom lowering.
+  bool IsGlobal = (AS == AMDGPUASI.GLOBAL_ADDRESS);
+  bool IsConstant = (AS == AMDGPUASI.CONSTANT_ADDRESS);
+  bool IsGlobalOrConstant = IsGlobal || IsConstant;
+  // TODO: support vec3 stores and move the logic of this condition into
+  //       shouldCombineMemoryType().
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
+      VT.isExtended() && VT.isVector() && VT.getVectorNumElements() == 3 &&
+      // There are no sub-dword vector loads.
+      VT.getVectorElementType().getStoreSize() == 4 &&
+      // There are no vector extloads.
+      LN->getExtensionType() == ISD::LoadExtType::NON_EXTLOAD &&
+      ((Subtarget->useFlatForGlobal() && IsGlobalOrConstant) ||
+       AS == AMDGPUASI.FLAT_ADDRESS) &&
+      // Uniform constant loads will be selected to scalar loads, which do not
+      // have a DWORDX3 form.
+      !((IsConstant || (IsGlobal && Subtarget->getScalarizeGlobalBehavior() &&
+                        isMemOpHasNoClobberedMemOperand(LN))) &&
+        isMemOpUniform(LN))) {
+    SDValue ZeroFlag = DAG.getTargetConstant(0, SL, MVT::i1); // GLC/SLC
+    SDValue Ptr = LN->getBasePtr();
+    SDValue Offset = LN->getOffset();
+
+    int64_t OffVal = 0;
+    if (auto OffC = dyn_cast<ConstantSDNode>(Offset))
+      OffVal = OffC->getSExtValue();
+    // GFX9: Imm offset: Scratch, Global: 13-bit signed byte offset
+    //       FLAT: 12-bit unsigned offset (MSB is ignored)
+    // TODO: It does not seem to be possible to get any offset after
+    //       SelectionDAGBuilder.
+    if ((OffVal && (!Subtarget->hasFlatInstOffsets() ||
+                    (IsGlobalOrConstant && !isInt<13>(OffVal)) ||
+                    !isUInt<12>(OffVal))) ||
+        // Is it possible to get a non-constant offset recorded in a LoadSDNode?
+        (!OffVal && !Offset.isUndef())) {
+      Ptr = DAG.getNode(ISD::ADD, SL, Ptr.getValueType(), Ptr, Offset);
+      OffVal = 0;
+    }
+    Offset = DAG.getTargetConstant(OffVal, SL, MVT::i16);
+
+    // TODO: introduce AMDGPUISD::LOAD3 returning v4i32 and select it later
+    //       to allow proper non-constant offset folding with GFX9 flat/global
+    //       instructions and with buffer_load_dwordx3.
+    //       That is, in case we are interested in supporting MUBUF or
+    //       VGPR offsets with an SGPR base on GFX9. Both are unclear.
+    //       However, SelectionDAGBuilder does not really record an offset
+    //       even if constant, so we still want to get that constant offset
+    //       and we do not want to replicate SelectADDR/MUBUFOffset code here.
+    unsigned Opc = AMDGPU::FLAT_LOAD_DWORDX3;
+
+    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9 &&
+        IsGlobalOrConstant)
+      Opc = AMDGPU::GLOBAL_LOAD_DWORDX3;
+
+    // We must return a legal v4 type because the DAG legalizer cannot widen
+    // machine node results, but knows how to widen BUILD_VECTOR.
+    EVT V4VT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+    auto NewLoad = DAG.getMachineNode(Opc, SL, V4VT, N->getValueType(1),
+                                      { Ptr, Offset, ZeroFlag, ZeroFlag });
+
+    auto MMOs = DAG.getMachineFunction().allocateMemRefsArray(1);
+    *MMOs = LN->getMemOperand();
+    NewLoad->setMemRefs(MMOs, MMOs + 1);
+
+    SmallVector<SDValue, 4> Elts;
+    DAG.ExtractVectorElements(SDValue(NewLoad, 0), Elts, 0, 3);
+    SDValue V3 = DAG.getBuildVector(VT, SL, { Elts[0], Elts[1], Elts[2] });
+    return DAG.getMergeValues({ V3, SDValue(NewLoad, 1) }, SL);
+  }
+
   if (!shouldCombineMemoryType(VT))
     return SDValue();
 
@@ -3793,3 +3864,17 @@
     return 1;
   }
 }
+
+bool AMDGPUTargetLowering::isMemOpUniform(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+
+  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
+}
+
+bool AMDGPUTargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N)
+    const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.noclobber");
+}
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -166,8 +166,6 @@
                           bool MemcpyStrSrc,
                           MachineFunction &MF) const override;
 
-  bool isMemOpUniform(const SDNode *N) const;
-  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
   bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -819,13 +819,6 @@
          isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
 }
 
-bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
-  const MemSDNode *MemNode = cast<MemSDNode>(N);
-  const Value *Ptr = MemNode->getMemOperand()->getValue();
-  const Instruction *I = dyn_cast<Instruction>(Ptr);
-  return I && I->getMetadata("amdgpu.noclobber");
-}
-
 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
@@ -836,12 +829,6 @@
   return isNoopAddrSpaceCast(SrcAS, DestAS);
 }
 
-bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
-  const MemSDNode *MemNode = cast<MemSDNode>(N);
-
-  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
-}
-
 TargetLoweringBase::LegalizeTypeAction
 SITargetLowering::getPreferredVectorAction(EVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
Index: test/CodeGen/AMDGPU/load-global-f32.ll
===================================================================
--- test/CodeGen/AMDGPU/load-global-f32.ll
+++ test/CodeGen/AMDGPU/load-global-f32.ll
@@ -31,7 +31,7 @@
 
 ; FUNC-LABEL: {{^}}global_load_v3f32:
 ; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx3
 
 ; R600: VTX_READ_128
 define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/load-global-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/load-global-i32.ll
+++ test/CodeGen/AMDGPU/load-global-i32.ll
@@ -30,7 +30,7 @@
 
 ; FUNC-LABEL: {{^}}global_load_v3i32:
 ; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx3
 
 ; EG: VTX_READ_128
 define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/load-vec3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/load-vec3.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-MUBUF %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}load_global_v3i32:
+; VI: flat_load_dwordx3
+; GFX9: global_load_dwordx3
+; GCN-MUBUF-DAG: buffer_load_dwordx2 v
+; GCN-MUBUF-DAG: buffer_load_dword v
+define amdgpu_kernel void @load_global_v3i32(float addrspace(1)* nocapture readonly %in, <3 x float> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds float, float addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast float addrspace(1)* %gep_in to <3 x i32> addrspace(1)*
+  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep_in_v3, align 4
+  %gep_out = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %out, i32 %id
+  %vec3i = bitcast <3 x i32> %load to <3 x float>
+  store <3 x float> %vec3i, <3 x float> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3f32:
+; VI: flat_load_dwordx3
+; GFX9: global_load_dwordx3
+; GCN-MUBUF-DAG: buffer_load_dwordx2 v
+; GCN-MUBUF-DAG: buffer_load_dword v
+define amdgpu_kernel void @load_global_v3f32(float addrspace(1)* nocapture readonly %in, <3 x float> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds float, float addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast float addrspace(1)* %gep_in to <3 x float> addrspace(1)*
+  %load = load <3 x float>, <3 x float> addrspace(1)* %gep_in_v3, align 4
+  %val = fadd <3 x float> %load, %load
+  %gep_out = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %out, i32 %id
+  store <3 x float> %val, <3 x float> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_constant_v3i32:
+; VI: flat_load_dwordx3
+; GFX9: global_load_dwordx3
+; GCN-MUBUF-DAG: buffer_load_dwordx2 v
+; GCN-MUBUF-DAG: buffer_load_dword v
+define amdgpu_kernel void @load_constant_v3i32(i32 addrspace(2)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %id
+  %gep_in_v3 = bitcast i32 addrspace(2)* %gep_in to <3 x i32> addrspace(2)*
+  %load = load <3 x i32>, <3 x i32> addrspace(2)* %gep_in_v3, align 4
+  %gep_out = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %id
+  store <3 x i32> %load, <3 x i32> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_v3i32:
+; GCN: flat_load_dwordx3
+define amdgpu_kernel void @load_flat_v3i32(i32 addrspace(4)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %id
+  %gep_in_v3 = bitcast i32 addrspace(4)* %gep_in to <3 x i32> addrspace(4)*
+  %load = load <3 x i32>, <3 x i32> addrspace(4)* %gep_in_v3, align 4
+  %gep_out = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %id
+  store <3 x i32> %load, <3 x i32> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3f16:
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN-NOT: load_dwordx3
+define amdgpu_kernel void @load_global_v3f16(half addrspace(1)* nocapture readonly %in, <3 x half> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds half, half addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast half addrspace(1)* %gep_in to <3 x half> addrspace(1)*
+  %load = load <3 x half>, <3 x half> addrspace(1)* %gep_in_v3, align 2
+  %val = fadd <3 x half> %load, %load
+  %gep_out = getelementptr inbounds <3 x half>, <3 x half> addrspace(1)* %out, i32 %id
+  store <3 x half> %val, <3 x half> addrspace(1)* %gep_out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3i16_to_v3i32:
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN-NOT: load_dwordx3
+define amdgpu_kernel void @load_global_v3i16_to_v3i32(i16 addrspace(1)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast i16 addrspace(1)* %gep_in to <3 x i16> addrspace(1)*
+  %load = load <3 x i16>, <3 x i16> addrspace(1)* %gep_in_v3, align 2
+  %val = zext <3 x i16> %load to <3 x i32>
+  %gep_out = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %id
+  store <3 x i32> %val, <3 x i32> addrspace(1)* %gep_out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3i32_scalar:
+; GCN-DAG: s_load_dwordx2 s[{{[0-9:]+}}], s[{{[0-9:]+}}], 0x0
+; GCN-DAG: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], 0x{{2|8}}
+; GCN-NOT: load_dwordx3
+define amdgpu_kernel void @load_global_v3i32_scalar(float addrspace(1)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %gep_in = getelementptr inbounds float, float addrspace(1)* %in, i32 0
+  %gep_in_v3 = bitcast float addrspace(1)* %gep_in to <3 x i32> addrspace(1)*
+  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep_in_v3, align 4
+  store <3 x i32> %load, <3 x i32> addrspace(1)* %out, align 16
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #1 = { nounwind readnone speculatable }