diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -801,6 +801,8 @@ auto &DL = M->getDataLayout(); auto Int64Ty = Type::getInt64Ty(Func.getContext()); + Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); + if (HiddenArgNumBytes >= 8) emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args); @@ -973,6 +975,11 @@ msgpack::ArrayDocNode Args) { auto &Func = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget(); + + // No implicit kernel argument is used. + if (ST.getImplicitArgNumBytes(Func) == 0) + return; + const Module *M = Func.getParent(); auto &DL = M->getDataLayout(); const SIMachineFunctionInfo &MFI = *MF.getInfo(); @@ -981,6 +988,7 @@ auto Int32Ty = Type::getInt32Ty(Func.getContext()); auto Int16Ty = Type::getInt16Ty(Func.getContext()); + Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args); diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll @@ -24,8 +24,8 @@ } ; CHECK: - .args: -; CHECK: .value_kind: hidden_multigrid_sync_arg -; PRE-GFX9: .offset: 200 +; PRE-GFX9: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9-NEXT: .offset: 200 ; PRE-GFX9-NEXT: .size: 4 ; PRE-GFX9-NEXT: .value_kind: hidden_private_base ; PRE-GFX9-NEXT: .offset: 204 @@ -44,8 +44,8 @@ } ; CHECK: - .args: -; CHECK: .value_kind: hidden_multigrid_sync_arg -; PRE-GFX9: .offset: 200 +; PRE-GFX9: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9-NEXT: .offset: 200 ; PRE-GFX9-NEXT: .size: 4 ; PRE-GFX9-NEXT: .value_kind: hidden_private_base ; PRE-GFX9-NEXT: .offset: 204 @@ -64,8 +64,8 @@ } ; CHECK: - .args: -; CHECK: .value_kind: hidden_multigrid_sync_arg -; PRE-GFX9: .offset: 192 +; PRE-GFX9: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9-NEXT: .offset: 192 ; PRE-GFX9-NEXT: .size: 4 ; PRE-GFX9-NEXT: .value_kind: hidden_private_base ; PRE-GFX9-NEXT: .offset: 196 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-reduced-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-reduced-hidden-args-v5.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-reduced-hidden-args-v5.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s - -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s - - -; CHECK: amdhsa.kernels: -; CHECK-NEXT: - .args: -; CHECK-NEXT: - .address_space: global -; CHECK-NEXT: .name: r -; CHECK-NEXT: .offset: 0 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: global_buffer -; CHECK-NEXT: - .address_space: global -; CHECK-NEXT: .name: a -; CHECK-NEXT: .offset: 8 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: global_buffer -; CHECK-NEXT: - .address_space: global -; CHECK-NEXT: .name: b -; CHECK-NEXT: .offset: 16 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: global_buffer -; CHECK-NEXT: - .offset: 24 -; CHECK-NEXT: .size: 4 -; CHECK-NEXT: .value_kind: hidden_block_count_x -; CHECK-NEXT: - .offset: 28 -; CHECK-NEXT: .size: 4 -; CHECK-NEXT: .value_kind: hidden_block_count_y -; CHECK-NEXT: - .offset: 32 -; CHECK-NEXT: .size: 4 -; CHECK-NEXT: .value_kind: hidden_block_count_z -; CHECK-NEXT: - .offset: 36 -; CHECK-NEXT: .size: 2 -; CHECK-NEXT: .value_kind: hidden_group_size_x -; CHECK-NEXT: - .offset: 38 -; CHECK-NEXT: .size: 2 -; CHECK-NEXT: .value_kind: hidden_group_size_y -; CHECK-NEXT: - .offset: 40 -; CHECK-NEXT: .size: 2 -; CHECK-NEXT: .value_kind: hidden_group_size_z -; CHECK-NEXT: - .offset: 42 -; CHECK-NEXT: .size: 2 -; CHECK-NEXT: .value_kind: hidden_remainder_x -; CHECK-NEXT: - .offset: 44 -; CHECK-NEXT: .size: 2 -; CHECK-NEXT: .value_kind: hidden_remainder_y -; CHECK-NEXT: - .offset: 46 -; CHECK-NEXT: .size: 2 -; CHECK-NEXT: .value_kind: hidden_remainder_z -; CHECK-NEXT: - .offset: 64 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_global_offset_x -; CHECK-NEXT: - .offset: 72 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_global_offset_y -; CHECK-NEXT: - .offset: 80 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_global_offset_z -; CHECK-NEXT: - .offset: 88 -; CHECK-NEXT: .size: 2 -; CHECK-NEXT: .value_kind: hidden_grid_dims -; CHECK-NEXT: - .address_space: global -; CHECK-NEXT: .offset: 112 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg -; GFX8-NEXT: - .offset: 216 -; GFX8-NEXT: .size: 4 -; GFX8-NEXT: .value_kind: hidden_private_base -; GFX8-NEXT: - .offset: 220 -; GFX8-NEXT: .size: 4 -; GFX8-NEXT: .value_kind: hidden_shared_base - -; CHECK: .name: test_v5_reduced_hidden -; CHECK: .symbol: test_v5_reduced_hidden.kd - -; CHECK: amdhsa.version: -; CHECK-NEXT: - 1 -; CHECK-NEXT: - 2 -define amdgpu_kernel void @test_v5_reduced_hidden( - half addrspace(1)* %r, - half addrspace(1)* %a, - half addrspace(1)* %b) { -entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %r.val = fadd half %a.val, %b.val - store half %r.val, half addrspace(1)* %r - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=CHECK %s + + +; CHECK-LABEL: test_unaligned_to_eight: +; CHECK: .amdhsa_kernarg_size 264 +define amdgpu_kernel void @test_unaligned_to_eight(i32 %four) { + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef + ret void +} + + +; CHECK-LABEL: test_aligned_to_eight: +; CHECK: .amdhsa_kernarg_size 264 +define amdgpu_kernel void @test_aligned_to_eight(i64 %eight) { + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef + ret void +} + +; CHECK-LABEL: amdhsa.kernels: +; CHECK: - .args: +; CHECK-NEXT: - .name: four +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 12 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK: .kernarg_segment_align: 8 +; CHECK-NEXT: .kernarg_segment_size: 264 +; CHECK-LABEL: .name: test_unaligned_to_eight + +; CHECK: - .args: +; CHECK-NEXT: - .name: eight +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 12 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK: .kernarg_segment_align: 8 +; CHECK-NEXT: .kernarg_segment_size: 264 +; CHECK-LABEL: .name: test_aligned_to_eight + +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()