Add float variants of the AMDGPU permlane16/permlanex16 builtins and intrinsics.

Summary of this patch:
  * clang: declares __builtin_amdgcn_permlane16_f32 and
    __builtin_amdgcn_permlanex16_f32 (signature "fffUiUiIbIb", feature
    "gfx10-insts") and adds CodeGen and Sema tests for both.
  * llvm: defines int_amdgcn_permlane16_f32 / int_amdgcn_permlanex16_f32,
    routes them through the existing permlane16/permlanex16 cases in
    AMDGPUInstCombineIntrinsic.cpp and AMDGPURegisterBankInfo.cpp, adds two
    SourceOfDivergence entries, and adds instruction definitions and
    selection patterns in VOP3Instructions.td.

NOTE(review): this copy of the patch had its line breaks collapsed and most
"<...>" spans removed. Line breaks, blank context lines and indentation below
are reconstructed from the hunk header counts. Template arguments were
restored only where the patch or the unchanged upstream context pins them:
ImmArg<ArgIndex<4>> / ImmArg<ArgIndex<5>> (the two i1 operands),
isa<UndefValue>, and the "<src> <sel>" operands in the mov.dpp8 comment.
The remaining stripped spans (arguments of SourceOfDivergence, IMAD32_Pats,
ThreeOp_i32_Pats, GISDNodeXFormEquiv, VOP3_Profile, VOP3_PERMLANE_Profile and
PermlanePat) are left exactly as found, and the PermlanePat class hunk is
missing lines, so its counts do not match its header. Recover these from the
original revision before applying.

NOTE(review): V_PERMLANE16_F32_B32 / V_PERMLANEX16_F32_B32 reuse the mnemonics
"v_permlane16_b32" / "v_permlanex16_b32" of the existing instructions; confirm
this does not produce conflicting assembler/disassembler entries. Several
hunks also add blank lines only.

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,6 +255,8 @@
 //===----------------------------------------------------------------------===//
 TARGET_BUILTIN(__builtin_amdgcn_permlane16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_permlanex16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_permlane16_f32, "fffUiUiIbIb", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_permlanex16_f32, "fffUiUiIbIb", "nc", "gfx10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nc", "gfx10-insts")
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -18,6 +18,19 @@
   *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
 }
 
+// CHECK-LABEL: @test_permlane16_f32(
+// CHECK: call float @llvm.amdgcn.permlane16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false)
+void test_permlane16_f32(global float* out, float a, float b, uint c, uint d) {
+  *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 0, 0);
+}
+
+// CHECK-LABEL: @test_permlanex16_f32(
+// CHECK: call float @llvm.amdgcn.permlanex16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false)
+void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d) {
+  *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 0, 0);
+}
+
+
 // CHECK-LABEL: @test_mov_dpp8(
 // CHECK: call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
 void test_mov_dpp8(global uint* out, uint a) {
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl
@@ -13,6 +13,18 @@
   *out = __builtin_amdgcn_permlanex16(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16' must be a constant integer}}
 }
 
+
+void test_permlane16_f32(global float* out, float a, float b, uint c, uint d, uint e) {
+  *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}}
+  *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}}
+}
+
+void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d, uint e) {
+  *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}}
+  *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}}
+}
+
+
 void test_mov_dpp8(global uint* out, uint a, uint b) {
   *out = __builtin_amdgcn_mov_dpp8(a, b); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1950,6 +1950,24 @@
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
 
+
+// llvm.amdgcn.permlane16.f32
+def int_amdgcn_permlane16_f32 : ClangBuiltin<"__builtin_amdgcn_permlane16_f32">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn,
+             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
+
+// llvm.amdgcn.permlanex16.f32
+def int_amdgcn_permlanex16_f32 : ClangBuiltin<"__builtin_amdgcn_permlanex16_f32">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn,
+             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
+
+
+
+
 // llvm.amdgcn.mov.dpp8.i32 <src> <sel>
 // <sel> is a 32-bit constant whose high 8 bits must be zero which selects
 // the lanes to read from.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -892,7 +892,9 @@
     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
   }
   case Intrinsic::amdgcn_permlane16:
-  case Intrinsic::amdgcn_permlanex16: {
+  case Intrinsic::amdgcn_permlanex16:
+  case Intrinsic::amdgcn_permlane16_f32:
+  case Intrinsic::amdgcn_permlanex16_f32: {
     // Discard vdst_in if it's not going to be read.
     Value *VDstIn = II.getArgOperand(0);
     if (isa<UndefValue>(VDstIn))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2990,7 +2990,9 @@
       applyDefaultMapping(OpdMapper);
       return;
     case Intrinsic::amdgcn_permlane16:
-    case Intrinsic::amdgcn_permlanex16: {
+    case Intrinsic::amdgcn_permlanex16:
+    case Intrinsic::amdgcn_permlane16_f32:
+    case Intrinsic::amdgcn_permlanex16_f32: {
       // Doing a waterfall loop over these wouldn't make any sense.
       substituteSimpleCopyRegs(OpdMapper, 2);
       substituteSimpleCopyRegs(OpdMapper, 3);
@@ -4367,7 +4369,9 @@
       break;
     }
     case Intrinsic::amdgcn_permlane16:
-    case Intrinsic::amdgcn_permlanex16: {
+    case Intrinsic::amdgcn_permlanex16:
+    case Intrinsic::amdgcn_permlane16_f32:
+    case Intrinsic::amdgcn_permlanex16_f32: {
       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -303,6 +303,8 @@
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence;
+def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -663,7 +663,9 @@
 let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in
 defm : IMAD32_Pats;
 
-def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> {
+
+
+class VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> {
   let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
                           IntOpSelMods:$src1_modifiers, SSrc_b32:$src1,
                           IntOpSelMods:$src2_modifiers, SSrc_b32:$src2,
@@ -679,9 +681,9 @@
 def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">,
   GISDNodeXFormEquiv;
 
-class PermlanePat : GCNPat<
-  (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
+  (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
             timm:$fi, timm:$bc),
   (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
         SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
@@ -695,12 +697,17 @@
   def : ThreeOp_i32_Pats;
 
   let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
-    defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
-    defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+    defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+    defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+    defm V_PERMLANE16_F32_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+    defm V_PERMLANEX16_F32_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
   } // End $vdst = $vdst_in, DisableEncoding $vdst_in
 
-  def : PermlanePat;
-  def : PermlanePat;
+  def : PermlanePat;
+  def : PermlanePat;
+  def : PermlanePat;
+  def : PermlanePat;
+
 
   defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>;
   defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile, sub>;