Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -735,6 +735,12 @@
             [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
             [IntrNoMem, IntrConvergent]>;
 
+// llvm.amdgcn.mov.dpp.wqm.i32 <src> <dpp ctrl> <row mask> <bank mask> <bound ctrl>
+def int_amdgcn_mov_dpp_wqm :
+  Intrinsic<[llvm_anyint_ty],
+            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i1_ty], [IntrNoMem, IntrConvergent]>;
+
 def int_amdgcn_s_dcache_wb : GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">,
   Intrinsic<[], [], []>;
 
Index: lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP1Instructions.td
+++ lib/Target/AMDGPU/VOP1Instructions.td
@@ -642,6 +642,12 @@
   let SubtargetPredicate = isVI;
 }
 
+def V_MOV_B32_dpp_wqm : VOP1_DPP<0x1, !cast<VOP1_Pseudo>("V_MOV_B32_e32")> {
+  let WQM = 1;
+  let isCodeGenOnly = 1;
+}
+
+
 // This is a pseudo variant of the v_movreld_b32 instruction in which the
 // vector operand appears only twice, once as def and once as use. Using this
 // pseudo avoids problems with the Two Address instructions pass.
@@ -671,6 +677,13 @@
                    (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
 >;
 
+def : Pat <
+  (i32 (int_amdgcn_mov_dpp_wqm i32:$src, imm:$dpp_ctrl, imm:$row_mask,
+                               imm:$bank_mask, imm:$bound_ctrl)),
+  (V_MOV_B32_dpp_wqm $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
+                     (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+>;
+
 def : Pat<
   (i32 (anyext i16:$src)),
 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.wqm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.wqm.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
+
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
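+; (A DPP mov cannot read a VGPR for two wait states after the VALU write that
+; defined it; the compiler currently inserts "s_nop 1" to cover the hazard.)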
+
+; VI-LABEL: {{^}}dpp_test:
+; VI: s_wqm_b64 exec, exec
+; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI: exp mrt0 v0, v0, v0, v0 done vm
+define amdgpu_ps void @dpp_test(i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %data = bitcast i32 %tmp0 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
+  ret void
+}
+
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: s_wqm_b64 exec, exec
+; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: exp mrt0 [[VGPR1]], v0, v0, v0 done vm
+define amdgpu_ps void @dpp_wait_states(i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  %data = bitcast i32 %tmp1 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
+  ret void
+}
+
+; VI-LABEL: {{^}}dpp_first_in_bb:
+; VI: s_wqm_b64 exec, exec
+; VI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, s{{\[[0-9]+:[0-9]+\]}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x02,0x01,0x08,0x11]
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI: exp mrt0 v{{[0-9]+}}, v0, v0, v0 done vm
+define amdgpu_ps void @dpp_first_in_bb(i32 %idx, i32 %idx2, float %cond, float %a, float %b) {
+  %cmp = fcmp oeq float %cond, 0.0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %out_val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %if_val = fadd float %a, %out_val
+  br label %endif
+
+else:
+  %in_val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx2, i32 0, i1 0, i1 0)
+  %else_val = fadd float %b, %in_val
+  br label %endif
+
+endif:
+  %val = phi float [%if_val, %if], [%else_val, %else]
+  %val_i32 = bitcast float %val to i32
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp2 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp_float = bitcast i32 %tmp2 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp_float, float undef, float undef, float undef, i1 true, i1 true) #1
+  ret void
+}
+
+; VI-LABEL: {{^}}llpc.dpdx.f32:
+; VI: s_wqm_b64 exec, exec
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 ; encoding: [0xfa,0x02,0x02,0x7e,0x00,0x00,0x08,0xff]
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x55,0x08,0xff]
+define amdgpu_ps float @llpc.dpdx.f32(float %p) {
+  ; Broadcast channel 0 to the whole quad
+  %p.i32 = bitcast float %p to i32
+  %p.dpp.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %p.i32, i32 0, i32 15, i32 15, i1 1)
+  %p.dpp = bitcast i32 %p.dpp.i32 to float
+  ; Calculate the delta value
+  %dp.x = fsub float %p, %p.dpp
+  %dp = fmul float %dp.x, 0.5 ; scale the delta (not part of the original LLPC sequence)
+  ; Broadcast channel 1 to the whole quad (85 = 0x55 = quad_perm:[1,1,1,1])
+  %dp.i32 = bitcast float %dp to i32
+  %dpdx.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %dp.i32, i32 85, i32 15, i32 15, i1 1)
+  %dpdx = bitcast i32 %dpdx.i32 to float
+  ret float %dpdx
+}
+
+declare i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32, i32, i32, i32, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
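
For comparison, here is a minimal sketch (not part of the patch) of the matching Y-direction derivative built on the same intrinsic. The function name @llpc.dpdy.f32 is hypothetical; the sketch assumes the usual quad layout (lanes 0 and 1 on the top row, lanes 2 and 3 below), so broadcasting channel 2 takes quad_perm:[2,2,2,2], i.e. a dpp_ctrl of 170 (0xAA, two selector bits per lane, just as 85 = 0x55 encodes quad_perm:[1,1,1,1] in the test above).

; Hypothetical Y-derivative sketch using the intrinsic added by this patch.
declare i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32, i32, i32, i32, i1)

define amdgpu_ps float @llpc.dpdy.f32(float %p) {
  ; Broadcast channel 0 to the whole quad
  %p.i32 = bitcast float %p to i32
  %p0.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %p.i32, i32 0, i32 15, i32 15, i1 1)
  %p0 = bitcast i32 %p0.i32 to float
  ; Per-lane delta against channel 0
  %dp = fsub float %p, %p0
  ; Broadcast channel 2 to the whole quad (170 = 0xAA = quad_perm:[2,2,2,2])
  %dp.i32 = bitcast float %dp to i32
  %dpdy.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %dp.i32, i32 170, i32 15, i32 15, i1 1)
  %dpdy = bitcast i32 %dpdy.i32 to float
  ret float %dpdy
}

The WQM variant is the point here: the broadcast channel may be a helper lane that is inactive in the exact exec mask, and the WQM = 1 flag on V_MOV_B32_dpp_wqm tells the whole-quad-mode pass to keep such reads under s_wqm_b64, which the plain llvm.amdgcn.mov.dpp does not guarantee.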