Index: clang/include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -182,6 +182,7 @@
 TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp")
 TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp")
 TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts")
 
 //===----------------------------------------------------------------------===//
 // GFX9+ only builtins.
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -7,6 +7,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 typedef unsigned long ulong;
+typedef unsigned int uint;
 
 // CHECK-LABEL: @test_div_fixup_f16
 // CHECK: call half @llvm.amdgcn.div.fixup.f16
@@ -137,3 +138,10 @@
 {
   *out = __builtin_amdgcn_s_memtime();
 }
+
+// CHECK-LABEL: @test_perm
+// CHECK: call i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s)
+void test_perm(global uint* out, uint a, uint b, uint s)
+{
+  *out = __builtin_amdgcn_perm(a, b, s);
+}
Index: clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
===================================================================
--- clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-vi.cl
@@ -2,7 +2,8 @@
 // RUN: %clang_cc1 -triple amdgcn-- -target-cpu tahiti -verify -S -o - %s
 // RUN: %clang_cc1 -triple amdgcn-- -target-cpu hawaii -verify -S -o - %s
 
-void test_vi_s_dcache_wb()
+void test_vi_builtins()
 {
   __builtin_amdgcn_s_dcache_wb(); // expected-error {{'__builtin_amdgcn_s_dcache_wb' needs target feature gfx8-insts}}
+  (void)__builtin_amdgcn_perm(1, 2, 3); // expected-error {{'__builtin_amdgcn_perm' needs target feature gfx8-insts}}
 }
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1716,6 +1716,12 @@
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
      [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
+// llvm.amdgcn.perm <src0> <src1> <src2>
+def int_amdgcn_perm :
+  GCCBuiltin<"__builtin_amdgcn_perm">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+     [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
 //===----------------------------------------------------------------------===//
 // GFX10 Intrinsics
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3949,6 +3949,7 @@
     case Intrinsic::amdgcn_cvt_pk_u8_f32:
     case Intrinsic::amdgcn_alignbit:
     case Intrinsic::amdgcn_alignbyte:
+    case Intrinsic::amdgcn_perm:
     case Intrinsic::amdgcn_fdot2:
     case Intrinsic::amdgcn_sdot2:
     case Intrinsic::amdgcn_udot2:
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6695,6 +6695,10 @@
   case Intrinsic::amdgcn_alignbit:
     return DAG.getNode(ISD::FSHR, DL, VT,
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::amdgcn_perm:
+    // Operand 0 is the intrinsic ID; the three sources are operands 1-3.
+    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
   case Intrinsic::amdgcn_reloc_constant: {
     Module *M = const_cast<Module *>(MF.getFunction().getParent());
     const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -417,10 +417,15 @@
 
 let SchedRW = [Write32Bit] in {
 let SubtargetPredicate = isGFX8Plus in {
-defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
+defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_perm>;
 } // End SubtargetPredicate = isGFX8Plus
 } // End SchedRW = [Write32Bit]
 
+def : GCNPat<
+  (AMDGPUperm i32:$src0, i32:$src1, i32:$src2),
+  (V_PERM_B32_e64 $src0, $src1, $src2)
+>;
+
 let SubtargetPredicate = isGFX7Plus in {
 
 let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.perm(i32, i32, i32) #0
+
+; GCN-LABEL: {{^}}v_perm_b32:
+; GCN: v_perm_b32 {{[vs][0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}
+define amdgpu_kernel void @v_perm_b32(i32 addrspace(1)* %out, i32 %src1, i32 %src2, i32 %src3) #1 {
+  %val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 %src3) #0
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }