diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -255,6 +255,8 @@ //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_permlane16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_permlanex16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlane16_f32, "fffUiUiIbIb", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlanex16_f32, "fffUiUiIbIb", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nc", "gfx10-insts") //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17358,6 +17358,32 @@ return Builder.CreateCall(F, Args); } + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlanex16: + case AMDGPU::BI__builtin_amdgcn_permlane16_f32: + case AMDGPU::BI__builtin_amdgcn_permlanex16_f32: { + Intrinsic::ID Intrin; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlane16_f32: + Intrin = Intrinsic::amdgcn_permlane16; + break; + case AMDGPU::BI__builtin_amdgcn_permlanex16: + case AMDGPU::BI__builtin_amdgcn_permlanex16_f32: + Intrin = Intrinsic::amdgcn_permlanex16; + break; + } + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); + llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); + llvm::Value *Src5 = EmitScalarExpr(E->getArg(5)); + + llvm::Function *F = CGM.getIntrinsic(Intrin, Src1->getType()); + return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5}); + } + // amdgcn workitem case AMDGPU::BI__builtin_amdgcn_workitem_id_x: return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl @@ -7,17 +7,30 @@ typedef unsigned long ulong; // CHECK-LABEL: @test_permlane16( -// CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0); } // CHECK-LABEL: @test_permlanex16( -// CHECK: call i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0); } +// CHECK-LABEL: @test_permlane16_f32( +// CHECK: call float @llvm.amdgcn.permlane16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false) +void test_permlane16_f32(global float* out, float a, float b, uint c, uint d) { + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 0, 0); +} + +// CHECK-LABEL: @test_permlanex16_f32( +// CHECK: call float @llvm.amdgcn.permlanex16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false) +void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d) { + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 0, 0); +} + + // CHECK-LABEL: @test_mov_dpp8( // CHECK: call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1) void test_mov_dpp8(global uint* out, uint a) { diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl @@ -13,6 +13,18 @@ *out = __builtin_amdgcn_permlanex16(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16' must be a constant integer}} } + +void test_permlane16_f32(global float* out, float a, float b, uint c, uint d, uint e) { + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}} + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}} +} + +void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d, uint e) { + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}} + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}} +} + + void test_mov_dpp8(global uint* out, uint a, uint b) { *out = __builtin_amdgcn_mov_dpp8(a, b); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}} } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1937,16 +1937,16 @@ //===----------------------------------------------------------------------===// // llvm.amdgcn.permlane16 -def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], +def int_amdgcn_permlane16 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlanex16 -def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], +def int_amdgcn_permlanex16 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -305,7 +305,7 @@ V = buildNonAtomicBinOp( B, Op, V, B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + Intrinsic::amdgcn_permlanex16, {B.getInt32Ty()}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()})); if (ST->isWave32()) @@ -362,7 +362,7 @@ // 48..63). assert(ST->hasPermLaneX16()); Value *const PermX = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + Intrinsic::amdgcn_permlanex16, {B.getInt32Ty()}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp( B, Op, V, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -663,7 +663,9 @@ let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in defm : IMAD32_Pats; -def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { + + +class VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0, IntOpSelMods:$src1_modifiers, SSrc_b32:$src1, IntOpSelMods:$src2_modifiers, SSrc_b32:$src2, @@ -679,11 +681,11 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">, GISDNodeXFormEquiv; -class PermlanePat : GCNPat< - (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, - timm:$fi, timm:$bc), - (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), + (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc)), + (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; @@ -695,12 +697,17 @@ def : ThreeOp_i32_Pats; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { - defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; - defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; + defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; + defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; + defm V_PERMLANE16_F32_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; + defm V_PERMLANEX16_F32_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; } // End $vdst = $vdst_in, DisableEncoding $vdst_in - def : PermlanePat; - def : PermlanePat; + def : PermlanePat; + def : PermlanePat; + def : PermlanePat; + def : PermlanePat; + defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>; defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile, sub>; diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll --- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -8,20 +8,35 @@ ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out ret void } + +; CHECK: DIVERGENT: %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +define amdgpu_kernel void @v_permlane16_b32_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) #0 { + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 + store float %v, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +define amdgpu_kernel void @v_permlanex16_b32_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) #0 { + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 + store float %v, ptr addrspace(1) %out + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0 define amdgpu_kernel void @update_dpp(ptr addrspace(1) %out, i32 %in1, i32 %in2) #0 { %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0 @@ -99,6 +114,8 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.permlanex16.f32(float, float, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -66,7 +66,7 @@ define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp { ; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR14:[0-9]+]] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR13:[0-9]+]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone @@ -107,7 +107,7 @@ define half @test_constant_fold_sqrt_f16_0() nounwind { ; CHECK-LABEL: @test_constant_fold_sqrt_f16_0( -; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR15:[0-9]+]] +; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR14:[0-9]+]] ; CHECK-NEXT: ret half [[VAL]] ; %val = call half @llvm.amdgcn.sqrt.f16(half 0.0) nounwind readnone @@ -116,7 +116,7 @@ define float @test_constant_fold_sqrt_f32_0() nounwind { ; CHECK-LABEL: @test_constant_fold_sqrt_f32_0( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR15]] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.sqrt.f32(float 0.0) nounwind readnone @@ -125,7 +125,7 @@ define double @test_constant_fold_sqrt_f64_0() nounwind { ; CHECK-LABEL: @test_constant_fold_sqrt_f64_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR15]] +; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret double [[VAL]] ; %val = call double @llvm.amdgcn.sqrt.f64(double 0.0) nounwind readnone @@ -134,7 +134,7 @@ define half @test_constant_fold_sqrt_f16_neg0() nounwind { ; CHECK-LABEL: @test_constant_fold_sqrt_f16_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR15]] +; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR14]] ; CHECK-NEXT: ret half [[VAL]] ; %val = call half @llvm.amdgcn.sqrt.f16(half -0.0) nounwind readnone @@ -143,7 +143,7 @@ define float @test_constant_fold_sqrt_f32_neg0() nounwind { ; CHECK-LABEL: @test_constant_fold_sqrt_f32_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR15]] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.sqrt.f32(float -0.0) nounwind readnone @@ -152,7 +152,7 @@ define double @test_constant_fold_sqrt_f64_neg0() nounwind { ; CHECK-LABEL: @test_constant_fold_sqrt_f64_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR15]] +; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret double [[VAL]] ; %val = call double @llvm.amdgcn.sqrt.f64(double -0.0) nounwind readnone @@ -644,7 +644,7 @@ define i1 @test_class_isnan_f32_strict(float %x) nounwind { ; CHECK-LABEL: @test_class_isnan_f32_strict( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR16:[0-9]+]] +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR15:[0-9]+]] ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) strictfp @@ -662,7 +662,7 @@ define i1 @test_class_is_p0_n0_f32_strict(float %x) nounwind { ; CHECK-LABEL: @test_class_is_p0_n0_f32_strict( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR16]] +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR15]] ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) strictfp @@ -1275,8 +1275,8 @@ define i32 @ubfe_offset_0_width_3(i32 %src) { ; CHECK-LABEL: @ubfe_offset_0_width_3( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[SRC:%.*]], 7 -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[BFE:%.*]] = and i32 [[SRC:%.*]], 7 +; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3) ret i32 %bfe @@ -1793,7 +1793,7 @@ define i64 @icmp_constant_inputs_true() { ; CHECK-LABEL: @icmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR17:[0-9]+]] +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR16:[0-9]+]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34) @@ -2500,7 +2500,7 @@ define i64 @fcmp_constant_inputs_true() { ; CHECK-LABEL: @fcmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]] +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4) @@ -2542,7 +2542,7 @@ define i64 @ballot_one_64() { ; CHECK-LABEL: @ballot_one_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]] +; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]] ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 1) @@ -2568,7 +2568,7 @@ define i32 @ballot_one_32() { ; CHECK-LABEL: @ballot_one_32( -; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR17]] +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR16]] ; CHECK-NEXT: ret i32 [[B]] ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 1) @@ -2821,34 +2821,34 @@ define amdgpu_kernel void @update_dpp_no_combine(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; CHECK-LABEL: @update_dpp_no_combine( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 [[IN1:%.*]], i32 [[IN2:%.*]], i32 1, i32 1, i32 1, i1 false) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: [[VAR0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 [[IN1:%.*]], i32 [[IN2:%.*]], i32 1, i32 1, i32 1, i1 false) +; CHECK-NEXT: store i32 [[VAR0]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) - store i32 %tmp0, ptr addrspace(1) %out + %var0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) + store i32 %var0, ptr addrspace(1) %out ret void } define amdgpu_kernel void @update_dpp_drop_old(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; CHECK-LABEL: @update_dpp_drop_old( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN2:%.*]], i32 3, i32 15, i32 15, i1 true) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: [[VAR0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN2:%.*]], i32 3, i32 15, i32 15, i1 true) +; CHECK-NEXT: store i32 [[VAR0]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 1) - store i32 %tmp0, ptr addrspace(1) %out + %var0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 1) + store i32 %var0, ptr addrspace(1) %out ret void } define amdgpu_kernel void @update_dpp_undef_old(ptr addrspace(1) %out, i32 %in1) { ; CHECK-LABEL: @update_dpp_undef_old( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN1:%.*]], i32 4, i32 15, i32 15, i1 true) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: [[VAR0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN1:%.*]], i32 4, i32 15, i32 15, i1 true) +; CHECK-NEXT: store i32 [[VAR0]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 1) - store i32 %tmp0, ptr addrspace(1) %out + %var0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 1) + store i32 %var0, ptr addrspace(1) %out ret void } @@ -2861,7 +2861,7 @@ define amdgpu_kernel void @permlane16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlane16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -2872,7 +2872,7 @@ define amdgpu_kernel void @permlane16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlane16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -2883,7 +2883,7 @@ define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlane16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -2892,6 +2892,42 @@ ret void } + +declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1 immarg, i1 immarg) + +define amdgpu_kernel void @permlane16_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlane16_f32( +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.permlane16.f32(float 1.234500e+04, float [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call float @llvm.amdgcn.permlane16.f32(float 12345.0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store float %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane16_bound_ctrl_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlane16_bound_ctrl_f32( +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.permlane16.f32(float undef, float [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call float @llvm.amdgcn.permlane16.f32(float 12345.0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store float %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlane16_fetch_invalid_bound_ctrl_f32( +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.permlane16.f32(float undef, float [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call float @llvm.amdgcn.permlane16.f32(float 12345.0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store float %res, ptr addrspace(1) %out + ret void +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.permlanex16 ; -------------------------------------------------------------------- @@ -2900,7 +2936,7 @@ define amdgpu_kernel void @permlanex16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlanex16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -2911,7 +2947,7 @@ define amdgpu_kernel void @permlanex16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlanex16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -2922,7 +2958,7 @@ define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlanex16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -2931,6 +2967,41 @@ ret void } +declare float @llvm.amdgcn.permlanex16.f32(float, float, i32, i32, i1 immarg, i1 immarg) + +define amdgpu_kernel void @permlanex16_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlanex16_f32( +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.permlanex16.f32(float 1.234500e+04, float [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call float @llvm.amdgcn.permlanex16.f32(float 12345.0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store float %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlanex16_bound_ctrl_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlanex16_bound_ctrl_f32( +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.permlanex16.f32(float undef, float [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call float @llvm.amdgcn.permlanex16.f32(float 12345.0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store float %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlanex16_fetch_invalid_bound_ctrl_f32( +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.permlanex16.f32(float undef, float [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call float @llvm.amdgcn.permlanex16.f32(float 12345.0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store float %res, ptr addrspace(1) %out + ret void +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.image.sample a16 ; -------------------------------------------------------------------- @@ -5586,7 +5657,7 @@ define double @trig_preop_constfold_strictfp() { ; CHECK-LABEL: @trig_preop_constfold_strictfp( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR16]] +; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR15]] ; CHECK-NEXT: ret double [[VAL]] ; %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) strictfp diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll --- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll +++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll @@ -555,12 +555,12 @@ define i32 @test_permlane16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg3 - ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) + ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg4 - ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) + ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) %v2 = call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) ret i32 %v2 } @@ -569,12 +569,12 @@ define i32 @test_permlanex16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg3 - ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) + ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg4 - ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) + ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) %v2 = call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) ret i32 %v2 } @@ -600,7 +600,6 @@ ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i32 %arg2 ; CHECK-NEXT: %val0 = call float @llvm.amdgcn.interp.p2(float %arg0, float %arg1, i32 %arg2, i32 0, i32 0) - %val0 = call float @llvm.amdgcn.interp.p2(float %arg0, float %arg1, i32 %arg2, i32 0, i32 0) store volatile float %val0, ptr addrspace(1) undef