Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -538,7 +538,7 @@ def int_amdgcn_mqsad_u32_u8 : GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_amdgcn_cvt_pk_u8_f32 : GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -255,6 +255,10 @@ return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID); } + bool isSCSrc128() const { + return isInlinableImm() || isRegClass(AMDGPU::SReg_128RegClassID); + } + bool isSSrc32() const { return isImm() || isSCSrc32() || isExpr(); } @@ -273,6 +277,10 @@ return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID); } + bool isVCSrc128() const { + return isInlinableImm() || isRegClass(AMDGPU::VS_128RegClassID); + } + bool isVSrc32() const { return isImm() || isVCSrc32(); } @@ -286,6 +294,11 @@ return isImm() || isVCSrc64(); } + bool isVSrc128() const { + return isImm() || isVCSrc128(); + } + + bool isMem() const override { return false; } Index: lib/Target/AMDGPU/CIInstructions.td =================================================================== --- lib/Target/AMDGPU/CIInstructions.td +++ lib/Target/AMDGPU/CIInstructions.td @@ -58,7 +58,7 @@ VOP_I64_I64_I32_I64, int_amdgcn_qsad_pk_u16_u8>; defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175, 0x1e7>, "v_mqsad_u32_u8", - VOP_I32_I32_I32_I32, int_amdgcn_mqsad_u32_u8>; + VOP_V4I32_I64_I32_V4I32, int_amdgcn_mqsad_u32_u8>; let isCommutable = 1 in { defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176, 0x1e8>, "v_mad_u64_u32", Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h 
=================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -57,6 +57,7 @@ MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -62,6 +62,7 @@ DECODE_OPERAND(VGPR_32) DECODE_OPERAND(VS_32) DECODE_OPERAND(VS_64) +DECODE_OPERAND(VS_128) DECODE_OPERAND(VReg_64) DECODE_OPERAND(VReg_96) @@ -234,6 +235,10 @@ return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -1096,34 +1096,38 @@ // instructions for the given VT. class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, - !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, - VOPDstOperand<SReg_64>))); // else VT == i1 + !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>)))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. 
class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); + RegisterOperand ret = !if(!eq(VT.Size, 128), VSrc_128, + !if(!eq(VT.Size, 64), VSrc_64, VSrc_32)); } // Returns the vreg register class to use for source operand given VT class getVregSrcForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); + RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, + !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } - // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { RegisterOperand ret = + !if(!eq(VT.Size, 128), + VCSrc_128, !if(!eq(VT.Size, 64), VCSrc_64, - !if(!eq(VT.Value, i1.Value), - SCSrc_64, - VCSrc_32 - ) - ); + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32) + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. @@ -1607,6 +1611,7 @@ def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; +def VOP_V4I32_I64_I32_V4I32: VOPProfile <[v4i32, i64, i32, v4i32]>; // This class is used only with VOPC instructions. 
Use $sdst for out operand class SIInstAlias : Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -345,6 +345,10 @@ let CopyCost = 2; } +def VS_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VReg_128, SReg_128)> { + let isAllocatable = 0; +} + //===----------------------------------------------------------------------===// // Register operands //===----------------------------------------------------------------------===// @@ -400,6 +404,10 @@ let ParserMatchClass = RegImmMatcher<"VSrc64">; } +def VSrc_128 : RegImmOperand<VS_128> { + let ParserMatchClass = RegImmMatcher<"VSrc128">; +} + //===----------------------------------------------------------------------===// // VSrc_* Operands with an VGPR //===----------------------------------------------------------------------===// @@ -422,3 +430,8 @@ def VCSrc_64 : RegInlineOperand<VS_64> { let ParserMatchClass = RegImmMatcher<"VCSrc64">; } + +def VCSrc_128 : RegInlineOperand<VS_128> { + let ParserMatchClass = RegImmMatcher<"VCSrc128">; +} + Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll @@ -1,21 +1,61 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare i32 @llvm.amdgcn.mqsad.u32.u8(i32, i32, i32) #0 +declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0 -; GCN-LABEL: {{^}}v_mqsad_u32_u8: -; GCN: v_mqsad_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mqsad_u32_u8(i32 addrspace(1)* %out, i32 %src) { - %result= call i32 @llvm.amdgcn.mqsad.u32.u8(i32 %src, i32 100, i32 100) #0 - store i32 %result, i32 addrspace(1)* %out, align 4 
+; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate: -; GCN: v_mqsad_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mqsad_u32_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { - %result= call i32 @llvm.amdgcn.mqsad.u32.u8(i32 %src, i32 %a, i32 %b) #0 - store i32 %result, i32 addrspace(1)* %out, align 4 +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { + store <4 x i32> <i32 10, i32 20, i32 30, i32 40>, <4 x i32> *undef + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> undef) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { + %temp0 = insertelement <4 x float> undef, float 10.0, i32 0 + %temp1 = insertelement <4 x float> %temp0, float 20.0, i32 1 + %temp2 
= insertelement <4 x float> %temp1, float 30.0, i32 2 + %temp3 = insertelement <4 x float> %temp2, float 40.0, i32 3 + %in = fptoui <4 x float> %temp3 to <4 x i32> + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr: +; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %load) { + %load.0 = extractelement <4 x i32> %load, i32 0 + %load.1 = extractelement <4 x i32> %load, i32 1 + %load.2 = extractelement <4 x i32> %load, i32 2 + %load.3 = extractelement <4 x i32> %load, i32 3 + + %temp0 = insertelement <4 x i32> undef, i32 %load.0, i32 0 + %temp1 = insertelement <4 x i32> %temp0, i32 %load.1, i32 1 + %temp2 = insertelement <4 x i32> %temp1, i32 %load.2, i32 2 + %temp3 = insertelement <4 x i32> %temp2, i32 %load.3, i32 3 + + %result= call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %temp3) #0 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void }