Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -16756,16 +16756,9 @@ const CallExpr *E = nullptr) { auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr); auto *Call = CGF.Builder.CreateCall(F); - Call->addRetAttr( - Attribute::getWithDereferenceableBytes(Call->getContext(), 64)); - Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4))); if (!E) return Call; - QualType BuiltinRetType = E->getType(); - auto *RetTy = cast(CGF.ConvertType(BuiltinRetType)); - if (RetTy == Call->getType()) - return Call; - return CGF.Builder.CreateAddrSpaceCast(Call, RetTy); + return CGF.Builder.CreateAddrSpaceCast(Call, CGF.ConvertType(E->getType())); } Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) { Index: clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu =================================================================== --- clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu +++ clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ // RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ -// RUN: | FileCheck -check-prefix=PRECOV5 %s +// RUN: | FileCheck -check-prefixes=PRECOV5,CHECK %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ // RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \ -// RUN: | FileCheck -check-prefix=COV5 %s +// RUN: | FileCheck -check-prefixes=COV5,CHECK %s #include "Inputs/cuda.h" // PRECOV5-LABEL: test_get_workgroup_size -// PRECOV5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// PRECOV5: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // PRECOV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4 // PRECOV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef // PRECOV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6 @@ -36,4 +36,7 @@ } } +// COV4: declare align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// COV5: declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} Index: clang/test/CodeGenCUDA/builtins-amdgcn.cu =================================================================== --- clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -21,7 +21,7 @@ // CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr // CHECK-NEXT: store ptr [[TMP2]], ptr [[DISPATCH_PTR_ASCAST]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8 @@ -154,7 +154,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr // CHECK-NEXT: store ptr [[TMP1]], ptr [[X_ASCAST]], align 8 // CHECK-NEXT: ret void Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl =================================================================== --- clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -516,12 +516,15 @@ } // CHECK-LABEL: @test_dispatch_ptr -// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() void test_dispatch_ptr(__constant unsigned char ** out) { *out = __builtin_amdgcn_dispatch_ptr(); } +// CHECK: declare align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + + // CHECK-LABEL: @test_queue_ptr // CHECK: call ptr addrspace(4) @llvm.amdgcn.queue.ptr() void test_queue_ptr(__constant unsigned char ** out) @@ -543,6 +546,9 @@ *out = __builtin_amdgcn_implicitarg_ptr(); } +// CHECK: declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + + // CHECK-LABEL: @test_get_group_id( // CHECK: tail call i32 @llvm.amdgcn.workgroup.id.x() // CHECK: tail call i32 @llvm.amdgcn.workgroup.id.y() @@ -583,7 +589,7 @@ } // CHECK-LABEL: @test_get_workgroup_size( -// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 4 // CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 6 @@ -601,7 +607,7 @@ } // CHECK-LABEL: @test_get_grid_size( -// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 12 // CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !invariant.load // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 16 Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -94,6 +94,11 @@ int Align = align; } +class Dereferenceable : IntrinsicProperty { + int ArgNo = idx.Value; + int Bytes = bytes; +} + // Returned - The specified argument is always the return value of the // intrinsic. class Returned : IntrinsicProperty { Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -141,8 +141,10 @@ <"__builtin_amdgcn_workgroup_id">; def int_amdgcn_dispatch_ptr : + ClangBuiltin<"__builtin_amdgcn_dispatch_ptr">, DefaultAttrsIntrinsic<[LLVMQualPointerType], [], - [Align, IntrNoMem, IntrSpeculatable]>; + [Align, Dereferenceable, IntrNoMem, + IntrSpeculatable]>; def int_amdgcn_queue_ptr : ClangBuiltin<"__builtin_amdgcn_queue_ptr">, @@ -154,6 +156,8 @@ DefaultAttrsIntrinsic<[LLVMQualPointerType], [], [Align, IntrNoMem, IntrSpeculatable]>; +// TODO: This is 8 for amdhsa. For others it's 4 for no real reason. +// This should also be dereferenceable(256) for amdhsa COV5. def int_amdgcn_implicitarg_ptr : ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">, DefaultAttrsIntrinsic<[LLVMQualPointerType], [], Index: llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll +++ llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll @@ -47,7 +47,7 @@ ; GCN-LABEL: @get_local_size_z( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16 -; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4 +; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 8 ; GCN-NEXT: store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; @@ -139,7 +139,7 @@ ; GCN-LABEL: @get_work_group_size_z( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16 -; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4 +; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 8 ; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; Index: llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp =================================================================== --- llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp +++ llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp @@ -1012,8 +1012,8 @@ GISelKnownBits Info(*MF); - EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyDispatchPtr)); - EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyQueuePtr)); + EXPECT_EQ(Align(8), Info.computeKnownAlignment(CopyDispatchPtr)); + EXPECT_EQ(Align(8), Info.computeKnownAlignment(CopyQueuePtr)); EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyKernargSegmentPtr)); EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyImplicitArgPtr)); EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyImplicitBufferPtr)); Index: llvm/utils/TableGen/CodeGenIntrinsics.h =================================================================== --- llvm/utils/TableGen/CodeGenIntrinsics.h +++ llvm/utils/TableGen/CodeGenIntrinsics.h @@ -119,7 +119,8 @@ WriteOnly, ReadNone, ImmArg, - Alignment + Alignment, + Dereferenceable }; struct ArgAttribute { Index: llvm/utils/TableGen/CodeGenTarget.cpp =================================================================== --- llvm/utils/TableGen/CodeGenTarget.cpp +++ llvm/utils/TableGen/CodeGenTarget.cpp @@ -923,6 +923,10 @@ unsigned ArgNo = R->getValueAsInt("ArgNo"); uint64_t Align = R->getValueAsInt("Align"); addArgAttribute(ArgNo, Alignment, Align); + } else if (R->isSubClassOf("Dereferenceable")) { + unsigned ArgNo = R->getValueAsInt("ArgNo"); + uint64_t Bytes = R->getValueAsInt("Bytes"); + addArgAttribute(ArgNo, Dereferenceable, Bytes); } else llvm_unreachable("Unknown property!"); } Index: llvm/utils/TableGen/IntrinsicEmitter.cpp =================================================================== --- llvm/utils/TableGen/IntrinsicEmitter.cpp +++ llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -726,6 +726,10 @@ OS << " Attribute::get(C, Attribute::Alignment, " << Attr.Value << "),\n"; break; + case CodeGenIntrinsic::Dereferenceable: + OS << " Attribute::get(C, Attribute::Dereferenceable, " + << Attr.Value << "),\n"; + break; } } OS << " });\n";