Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -762,6 +762,19 @@ [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent]>; +// llvm.amdgcn.update.dpp.i32 old src dpp_ctrl row_mask bank_mask bound_ctrl wqm_ctrl +// Should be equivalent to: +// v_mov_b32 dest, old +// v_mov_b32 dest, src dpp_ctrl row_mask bank_mask bound_ctrl +// The wqm_ctrl argument decides whether to enable WQM or WWM: +// 0 - use normal EXEC +// 1 - WQM (Whole Quad Mode) +// 2 - WWM (Whole Wavefront Mode) +def int_amdgcn_update_dpp : + Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>; + def int_amdgcn_s_dcache_wb : GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">, Intrinsic<[], [], []>; Index: lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP1Instructions.td +++ lib/Target/AMDGPU/VOP1Instructions.td @@ -658,6 +658,14 @@ (as_i1imm $bound_ctrl)) >; +def : Pat < + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, + imm:$bank_mask, imm:$bound_ctrl, imm:$wqm_ctrl)), + (V_MOV_B32_dpp (as_i8imm $wqm_ctrl), $old, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; + def : Pat< (i32 (anyext i16:$src)), (COPY $src) Index: test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s + +; VI-LABEL: {{^}}dpp_test: 
+; VI: v_mov_b32_e32 v0, s{{[0-9]+}} +; VI: v_mov_b32_e32 v1, s{{[0-9]+}} +; VI: s_nop 1 +; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11] +define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { + %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1, i32 0) #0 + store i32 %tmp0, i32 addrspace(1)* %out + ret void +} + +; VI-LABEL: {{^}}dpp_wqm_test: +; VI: s_wqm_b64 exec, exec +; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11] +define amdgpu_kernel void @dpp_wqm_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { + %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1, i32 1) #0 + store i32 %tmp0, i32 addrspace(1)* %out + ret void +} + +; VI-LABEL: {{^}}dpp_wwm_test: +; VI: s_mov_b64 exec, -1 +; VI: v_mov_b32_e32 v0, s{{[0-9]+}} +; VI: v_mov_b32_e32 v1, s{{[0-9]+}} +; VI: s_nop 1 +; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11] +define amdgpu_kernel void @dpp_wwm_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { + %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1, i32 2) #0 + store i32 %tmp0, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1, i32) #0 + +attributes #0 = { nounwind readnone convergent }