Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -538,7 +538,7 @@ def int_amdgcn_mqsad_u32_u8 : GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_amdgcn_cvt_pk_u8_f32 : GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -255,6 +255,10 @@ return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID); } + bool isSCSrc128() const { + return isInlinableImm() || isRegClass(AMDGPU::SReg_128RegClassID); + } + bool isSSrc32() const { return isImm() || isSCSrc32() || isExpr(); } @@ -273,6 +277,10 @@ return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID); } + bool isVCSrc128() const { + return isInlinableImm() || isRegClass(AMDGPU::VS_128RegClassID); + } + bool isVSrc32() const { return isImm() || isVCSrc32(); } @@ -286,6 +294,11 @@ return isImm() || isVCSrc64(); } + bool isVSrc128() const { + return isImm() || isVCSrc128(); + } + + bool isMem() const override { return false; } Index: lib/Target/AMDGPU/CIInstructions.td =================================================================== --- lib/Target/AMDGPU/CIInstructions.td +++ lib/Target/AMDGPU/CIInstructions.td @@ -58,7 +58,7 @@ VOP_I64_I64_I32_I64, int_amdgcn_qsad_pk_u16_u8>; defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175, 0x1e7>, "v_mqsad_u32_u8", - VOP_I32_I32_I32_I32, int_amdgcn_mqsad_u32_u8>; + VOP_V4I32_I64_I32_V4I32, int_amdgcn_mqsad_u32_u8>; let isCommutable = 1 in { defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176, 0x1e8>, "v_mad_u64_u32", Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h 
=================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -57,6 +57,7 @@ MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -62,6 +62,7 @@ DECODE_OPERAND(VGPR_32) DECODE_OPERAND(VS_32) DECODE_OPERAND(VS_64) +DECODE_OPERAND(VS_128) DECODE_OPERAND(VReg_64) DECODE_OPERAND(VReg_96) @@ -234,6 +235,10 @@ return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -1096,34 +1096,38 @@ // instructions for the given VT. class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, - !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, - VOPDstOperand<SReg_64>))); // else VT == i1 + !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>)))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. 
class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); + RegisterOperand ret = !if(!eq(VT.Size, 128), VSrc_128, + !if(!eq(VT.Size, 64), VSrc_64, VSrc_32)); } // Returns the vreg register class to use for source operand given VT class getVregSrcForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); + RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, + !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } - // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { RegisterOperand ret = + !if(!eq(VT.Size, 128), + VCSrc_128, !if(!eq(VT.Size, 64), VCSrc_64, - !if(!eq(VT.Value, i1.Value), - SCSrc_64, - VCSrc_32 - ) - ); + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32) + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. @@ -1607,6 +1611,7 @@ def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; +def VOP_V4I32_I64_I32_V4I32: VOPProfile <[v4i32, i64, i32, v4i32]>; // This class is used only with VOPC instructions. 
Use $sdst for out operand class SIInstAlias : Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -345,6 +345,10 @@ let CopyCost = 2; } +def VS_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VReg_128, SReg_128)> { + let isAllocatable = 0; +} + //===----------------------------------------------------------------------===// // Register operands //===----------------------------------------------------------------------===// @@ -400,6 +404,10 @@ let ParserMatchClass = RegImmMatcher<"VSrc64">; } +def VSrc_128 : RegImmOperand<VS_128> { + let ParserMatchClass = RegImmMatcher<"VSrc128">; +} + //===----------------------------------------------------------------------===// // VSrc_* Operands with an VGPR //===----------------------------------------------------------------------===// @@ -422,3 +430,8 @@ def VCSrc_64 : RegInlineOperand<VS_64> { let ParserMatchClass = RegImmMatcher<"VCSrc64">; } + +def VCSrc_128 : RegInlineOperand<VS_128> { + let ParserMatchClass = RegImmMatcher<"VCSrc128">; +} + Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll @@ -1,21 +1,61 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare i32 @llvm.amdgcn.mqsad.u32.u8(i32, i32, i32) #0 +declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0 -; GCN-LABEL: {{^}}v_mqsad_u32_u8: -; GCN: v_mqsad_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mqsad_u32_u8(i32 addrspace(1)* %out, i32 %src) { - %result= call i32 @llvm.amdgcn.mqsad.u32.u8(i32 %src, i32 100, i32 100) #0 - store i32 %result, i32 addrspace(1)* %out, align 4 
+; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate: -; GCN: v_mqsad_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mqsad_u32_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { - %result= call i32 @llvm.amdgcn.mqsad.u32.u8(i32 %src, i32 %a, i32 %b) #0 - store i32 %result, i32 addrspace(1)* %out, align 4 +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { + store <4 x i32> <i32 10, i32 20, i32 30, i32 40>, <4 x i32> *undef + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> undef) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { + %temp0 = insertelement <4 x float> undef, float 10.0, i32 0 + %temp1 = insertelement <4 x float> %temp0, float 20.0, i32 1 + %temp2 
= insertelement <4 x float> %temp1, float 30.0, i32 2 + %temp3 = insertelement <4 x float> %temp2, float 40.0, i32 3 + %in = fptoui <4 x float> %temp3 to <4 x i32> + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %load) { + %load.0 = extractelement <4 x i32> %load, i32 0 + %load.1 = extractelement <4 x i32> %load, i32 1 + %load.2 = extractelement <4 x i32> %load, i32 2 + %load.3 = extractelement <4 x i32> %load, i32 3 + + %temp0 = insertelement <4 x i32> undef, i32 %load.0, i32 0 + %temp1 = insertelement <4 x i32> %temp0, i32 %load.1, i32 1 + %temp2 = insertelement <4 x i32> %temp1, i32 %load.2, i32 2 + %temp3 = insertelement <4 x i32> %temp2, i32 %load.3, i32 3 + + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %temp3) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void }