diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -182,6 +182,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp") TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp") TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts") +TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts") //===----------------------------------------------------------------------===// // GFX9+ only builtins. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -7,6 +7,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable typedef unsigned long ulong; +typedef unsigned int uint; // CHECK-LABEL: @test_div_fixup_f16 // CHECK: call half @llvm.amdgcn.div.fixup.f16 @@ -137,3 +138,10 @@ { *out = __builtin_amdgcn_s_memtime(); } + +// CHECK-LABEL: @test_perm +// CHECK: call i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s) +void test_perm(global uint* out, uint a, uint b, uint s) +{ + *out = __builtin_amdgcn_perm(a, b, s); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl @@ -2,7 +2,8 @@ // RUN: %clang_cc1 -triple amdgcn-- -target-cpu tahiti -verify -S -o - %s // RUN: %clang_cc1 -triple amdgcn-- -target-cpu hawaii -verify -S -o - %s -void test_vi_s_dcache_wb() +void test_vi_builtins() { __builtin_amdgcn_s_dcache_wb(); // expected-error {{'__builtin_amdgcn_s_dcache_wb' needs target feature gfx8-insts}} + (void)__builtin_amdgcn_perm(1, 2, 3); // expected-error {{'__builtin_amdgcn_perm' needs target feature gfx8-insts}} } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1716,6 +1716,12 @@ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]>; +// llvm.amdgcn.perm +def int_amdgcn_perm : + GCCBuiltin<"__builtin_amdgcn_perm">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; + //===----------------------------------------------------------------------===// // GFX10 Intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -313,7 +313,7 @@ SDTCisInt<4>]>, []>; -def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; +def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ @@ -463,3 +463,7 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc), [(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc), (AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>; + +def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_perm node:$src0, node:$src1, node:$src2), + (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3949,6 +3949,7 @@ case Intrinsic::amdgcn_cvt_pk_u8_f32: case Intrinsic::amdgcn_alignbit: case Intrinsic::amdgcn_alignbyte: + case Intrinsic::amdgcn_perm: case Intrinsic::amdgcn_fdot2: case Intrinsic::amdgcn_sdot2: case Intrinsic::amdgcn_udot2: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6695,6 +6695,9 @@ case Intrinsic::amdgcn_alignbit: return DAG.getNode(ISD::FSHR, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_perm: + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { Module *M = const_cast(MF.getFunction().getParent()); const MDNode *Metadata = cast(Op.getOperand(1))->getMD(); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.amdgcn.perm(i32, i32, i32) #0 + +; GCN-LABEL: {{^}}v_perm_b32_v_v_v: +; GCN: v_perm_b32 v{{[0-9]+}}, v0, v1, v2 +define amdgpu_ps void @v_perm_b32_v_v_v(i32 %src1, i32 %src2, i32 %src3, i32 addrspace(1)* %out) #1 { + %val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 %src3) #0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_perm_b32_v_v_c: +; GCN: v_perm_b32 v{{[0-9]+}}, v0, v1, {{[vs][0-9]+}} +define amdgpu_ps void @v_perm_b32_v_v_c(i32 %src1, i32 %src2, i32 addrspace(1)* %out) #1 { + %val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 12345) #0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_perm_b32_s_v_c: +; GCN: v_perm_b32 v{{[0-9]+}}, s0, v0, v{{[0-9]+}} +define amdgpu_ps void @v_perm_b32_s_v_c(i32 inreg %src1, i32 %src2, i32 addrspace(1)* %out) #1 { + %val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 12345) #0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_perm_b32_s_s_c: +; GCN: v_perm_b32 v{{[0-9]+}}, s0, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_ps void @v_perm_b32_s_s_c(i32 inreg %src1, i32 inreg %src2, i32 addrspace(1)* %out) #1 { + %val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 12345) #0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_perm_b32_v_s_i: +; GCN: v_perm_b32 v{{[0-9]+}}, v0, s0, 1 +define amdgpu_ps void @v_perm_b32_v_s_i(i32 %src1, i32 inreg %src2, i32 addrspace(1)* %out) #1 { + %val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 1) #0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind }