Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1191,7 +1191,7 @@ // Deep learning intrinsics. //===----------------------------------------------------------------------===// -// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c) +// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c def int_amdgcn_fdot2 : GCCBuiltin<"__builtin_amdgcn_fdot2">, @@ -1200,12 +1200,13 @@ [ llvm_v2f16_ty, // %a llvm_v2f16_ty, // %b - llvm_float_ty // %c + llvm_float_ty, // %c + llvm_i1_ty // %clamp ], [IntrNoMem, IntrSpeculatable] >; -// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c) +// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c def int_amdgcn_sdot2 : GCCBuiltin<"__builtin_amdgcn_sdot2">, @@ -1214,12 +1215,13 @@ [ llvm_v2i16_ty, // %a llvm_v2i16_ty, // %b - llvm_i32_ty // %c + llvm_i32_ty, // %c + llvm_i1_ty // %clamp ], [IntrNoMem, IntrSpeculatable] >; -// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c) +// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c def int_amdgcn_udot2 : GCCBuiltin<"__builtin_amdgcn_udot2">, @@ -1228,12 +1230,13 @@ [ llvm_v2i16_ty, // %a llvm_v2i16_ty, // %b - llvm_i32_ty // %c + llvm_i32_ty, // %c + llvm_i1_ty // %clamp ], [IntrNoMem, IntrSpeculatable] >; -// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c) +// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c def int_amdgcn_sdot4 : GCCBuiltin<"__builtin_amdgcn_sdot4">, @@ -1242,12 +1245,13 @@ [ llvm_i32_ty, // %a llvm_i32_ty, // %b - llvm_i32_ty // %c + llvm_i32_ty, // %c + llvm_i1_ty // %clamp ], 
[IntrNoMem, IntrSpeculatable] >; -// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c) +// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c def int_amdgcn_udot4 : GCCBuiltin<"__builtin_amdgcn_udot4">, @@ -1256,12 +1260,13 @@ [ llvm_i32_ty, // %a llvm_i32_ty, // %b - llvm_i32_ty // %c + llvm_i32_ty, // %c + llvm_i1_ty // %clamp ], [IntrNoMem, IntrSpeculatable] >; -// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c) +// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c def int_amdgcn_sdot8 : @@ -1271,12 +1276,13 @@ [ llvm_i32_ty, // %a llvm_i32_ty, // %b - llvm_i32_ty // %c + llvm_i32_ty, // %c + llvm_i1_ty // %clamp ], [IntrNoMem, IntrSpeculatable] >; -// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c) +// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c def int_amdgcn_udot8 : @@ -1286,7 +1292,8 @@ [ llvm_i32_ty, // %a llvm_i32_ty, // %b - llvm_i32_ty // %c + llvm_i32_ty, // %c + llvm_i1_ty // %clamp ], [IntrNoMem, IntrSpeculatable] >; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -342,8 +342,9 @@ def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", - SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, - SDTCisFP<0>, SDTCisVec<1>]>, + SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, + 
SDTCisFP<0>, SDTCisVec<1>, + SDTCisInt<4>]>, []>; def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5010,7 +5010,8 @@ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fdot2: return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -7613,8 +7614,10 @@ return SDValue(); if ((Vec1 == Vec3 && Vec2 == Vec4) || - (Vec1 == Vec4 && Vec2 == Vec3)) - return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc); + (Vec1 == Vec4 && Vec2 == Vec3)) { + return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc, + DAG.getTargetConstant(0, SL, MVT::i1)); + } } return SDValue(); } Index: llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/VOP3PInstructions.td @@ -167,13 +167,30 @@ let SubtargetPredicate = HasDLInsts in { -def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile, AMDGPUfdot2>; -def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile, int_amdgcn_sdot2>; -def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile, int_amdgcn_udot2>; -def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile, int_amdgcn_sdot4>; -def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile, int_amdgcn_udot4>; -def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile, int_amdgcn_sdot8>; -def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile, int_amdgcn_udot8>; +def 
V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile>; +def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile>; +def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile>; +def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile>; +def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile>; +def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile>; +def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile>; + +multiclass DotPats<SDPatternOperator dot_op, VOP3PInst dot_inst> { + def : GCNPat < + (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)), + (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)), + (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp), + (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>; +} + +defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>; +defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>; +defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>; +defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>; +defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>; +defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>; +defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>; } // End SubtargetPredicate = HasDLInsts Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906 -declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c) +declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp) -; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2 -; GFX906: v_dot2_f32_f16 -define amdgpu_kernel void @test_llvm_amdgcn_fdot2( +; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp +; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp( float addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -13,7 +13,23 @@ %a.val = load <2 x half>, <2 x 
half> addrspace(1)* %a %b.val = load <2 x half>, <2 x half> addrspace(1)* %b %c.val = load float, float addrspace(1)* %c - %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val) + %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1) + store float %r.val, float addrspace(1)* %r + ret void +} + +; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp +; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp( + float addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b, + float addrspace(1)* %c) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %c.val = load float, float addrspace(1)* %c + %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0) store float %r.val, float addrspace(1)* %r ret void } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) +declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2 -; GFX906: v_dot2_i32_i16 -define amdgpu_kernel void @test_llvm_amdgcn_sdot2( +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_clamp +; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot2_clamp( i32 addrspace(1)* %r, <2 x i16> addrspace(1)* %a, <2 x i16> addrspace(1)* %b, @@ -13,7 +13,23 @@ %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a 
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_no_clamp +; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot2_no_clamp( + i32 addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4 -; GFX906: v_dot4_i32_i8 -define amdgpu_kernel void @test_llvm_amdgcn_sdot4( +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp +; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp( i32 addrspace(1)* %r, <4 x i8> addrspace(1)* %a, <4 x i8> addrspace(1)* %b, @@ -15,7 +15,25 @@ %a.val.cast = bitcast <4 x i8> %a.val to i32 %b.val.cast = bitcast <4 x i8> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 
@llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp +; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp( + i32 addrspace(1)* %r, + <4 x i8> addrspace(1)* %a, + <4 x i8> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a + %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b + %a.val.cast = bitcast <4 x i8> %a.val to i32 + %b.val.cast = bitcast <4 x i8> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8 -; GFX906: v_dot8_i32_i4 -define amdgpu_kernel void @test_llvm_amdgcn_sdot8( +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp +; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp( i32 addrspace(1)* %r, <8 x i4> addrspace(1)* %a, <8 x i4> addrspace(1)* %b, @@ -15,7 +15,25 @@ %a.val.cast = bitcast <8 x i4> %a.val to i32 %b.val.cast = bitcast <8 x i4> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, 
i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp +; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp( + i32 addrspace(1)* %r, + <8 x i4> addrspace(1)* %a, + <8 x i4> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a + %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b + %a.val.cast = bitcast <8 x i4> %a.val to i32 + %b.val.cast = bitcast <8 x i4> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) +declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2 -; GFX906: v_dot2_u32_u16 -define amdgpu_kernel void @test_llvm_amdgcn_udot2( +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp +; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp( i32 addrspace(1)* %r, <2 x i16> addrspace(1)* %a, <2 x i16> addrspace(1)* %b, @@ -13,7 +13,23 @@ %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> 
%a.val, <2 x i16> %b.val, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp +; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp( + i32 addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4 -; GFX906: v_dot4_u32_u8 -define amdgpu_kernel void @test_llvm_amdgcn_udot4( +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_clamp +; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot4_clamp( i32 addrspace(1)* %r, <4 x i8> addrspace(1)* %a, <4 x i8> addrspace(1)* %b, @@ -15,7 +15,25 @@ %a.val.cast = bitcast <4 x i8> %a.val to i32 %b.val.cast = bitcast <4 x i8> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 
%c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_no_clamp +; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot4_no_clamp( + i32 addrspace(1)* %r, + <4 x i8> addrspace(1)* %a, + <4 x i8> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a + %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b + %a.val.cast = bitcast <4 x i8> %a.val to i32 + %b.val.cast = bitcast <4 x i8> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8 -; GFX906: v_dot8_u32_u4 -define amdgpu_kernel void @test_llvm_amdgcn_udot8( +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_clamp +; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot8_clamp( i32 addrspace(1)* %r, <8 x i4> addrspace(1)* %a, <8 x i4> addrspace(1)* %b, @@ -15,7 +15,25 @@ %a.val.cast = bitcast <8 x i4> %a.val to i32 %b.val.cast = bitcast <8 x i4> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1) + store i32 %r.val, i32 
addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_no_clamp +; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot8_no_clamp( + i32 addrspace(1)* %r, + <8 x i4> addrspace(1)* %a, + <8 x i4> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a + %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b + %a.val.cast = bitcast <8 x i4> %a.val to i32 + %b.val.cast = bitcast <8 x i4> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void }