Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -77,6 +77,10 @@
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+ModulePass *createAMDGPUFixFunctionBitcastsPass();
+void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
+extern char &AMDGPUFixFunctionBitcastsID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
Index: lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -0,0 +1,63 @@
+//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Promote indirect (bitcast) calls to direct calls when they are statically
+/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
+/// because AMDGPU does not support indirect calls.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
+
+namespace {
+class AMDGPUFixFunctionBitcasts final
+    : public ModulePass,
+      public InstVisitor<AMDGPUFixFunctionBitcasts> {
+
+  bool runOnModule(Module &M) override;
+
+  bool Modified;
+
+public:
+  void visitCallSite(CallSite CS) {
+    if (CS.getCalledFunction())
+      return;
+    auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+    if (Callee && isLegalToPromote(CS, Callee)) {
+      promoteCall(CS, Callee);
+      Modified = true;
+    }
+  }
+
+  static char ID;
+  AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUFixFunctionBitcasts::ID = 0;
+char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID;
+INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE,
+                "Fix function bitcasts for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
+  return new AMDGPUFixFunctionBitcasts();
+}
+
+bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
+  Modified = false;
+  visit(M);
+  return Modified;
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -166,6 +166,7 @@
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
@@ -610,6 +611,10 @@
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  // This must occur before inlining, as the inliner will not look through
+  // bitcast calls.
+  addPass(createAMDGPUFixFunctionBitcastsPass());
+
   addPass(createAtomicExpandPass());
 
   addPass(createAMDGPULowerIntrinsicsPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -40,6 +40,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstrInfo.cpp
Index: test/CodeGen/AMDGPU/call-constexpr.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/call-constexpr.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-fix-function-bitcasts < %s | FileCheck -check-prefix=OPT %s
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_noinline:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_noinline(
+; OPT: %val = call i32 @ret_i32_noinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_noinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline:
+; GCN-NOT: s_getpc_b64
+; GCN-NOT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@lo+4
+; GCN-NOT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@hi+4
+; GCN-NOT: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_alwaysinline(
+; OPT: %val = call i32 @ret_i32_alwaysinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_alwaysinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_type:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_type(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT-NOT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_type() #0 {
+  %val = call i32 bitcast (i32(i32)* @ident_i32 to i32(float)*)(float 2.0)
+  %op = add i32 %val, 1
+  store volatile i32 %op, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_and_return_types(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
+  %val = call float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64
+define i32 @use_workitem_id_x(i32 %arg0) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %op = add i32 %id, %arg0
+  ret i32 %op
+}
+
+; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+4
+; GCN: v_mov_b32_e32 v0, 9
+; GCN: s_swappc_b64
+; GCN: v_add_f32_e32
+; OPT-LABEL: @test_bitcast_use_workitem_id_x(
+; OPT: %val = call i32 @use_workitem_id_x(i32 9)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 {
+  %val = call float bitcast (i32(i32)* @use_workitem_id_x to float(i32)*)(i32 9)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_invoke:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_invoke(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = invoke i32 @ident_i32(i32 %1)
+; OPT-NEXT: to label %continue.split unwind label %broken
+; OPT-LABEL: continue.split:
+; OPT: bitcast i32 %val to float
+@_ZTIi = external global i8*
+declare i32 @__gxx_personality_v0(...)
+define amdgpu_kernel void @test_invoke() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  %val = invoke float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+          to label %continue unwind label %broken
+
+broken:
+  landingpad { i8*, i32 } catch i8** @_ZTIi
+  ret void
+
+continue:
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; Callees appear last in the source file to test that we still lower their
+; arguments before we lower any calls to them.
+
+define i32 @ret_i32_noinline() #0 {
+  ret i32 4
+}
+
+define i32 @ret_i32_alwaysinline() #1 {
+  ret i32 4
+}
+
+define i32 @ident_i32(i32 %i) #0 {
+  ret i32 %i
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind noinline }
+attributes #1 = { alwaysinline nounwind }
+attributes #2 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -1,8 +1,4 @@
-; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck %s
-
-; FIXME: Error is misleading because it's not an indirect call.
-
-; CHECK: error: <unknown>:0:0: in function crash_call_constexpr_cast void (): unsupported indirect call to function foo
+; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
 
 ; Make sure that AMDGPUPromoteAlloca doesn't crash if the called
 ; function is a constantexpr cast of a function.
@@ -10,14 +6,18 @@
 declare void @foo(float addrspace(5)*) #0
 declare void @foo.varargs(...) #0
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
+; CHECK-LABEL: @crash_call_constexpr_cast(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (float addrspace(5)*)* @foo to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
   ret void
 }
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
+; CHECK-LABEL: @crash_call_constexpr_cast_varargs(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (...)* @foo.varargs to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
   ret void
Index: test/CodeGen/AMDGPU/unsupported-calls.ll
===================================================================
--- test/CodeGen/AMDGPU/unsupported-calls.ll
+++ test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -53,7 +53,7 @@
 
 declare i32 @extern_variadic(...)
 
-; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported indirect call to function extern_variadic
+; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to variadic function extern_variadic
 ; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
 define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
   %add = fadd <4 x float> %arg0, %arg1
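
Note for reviewers (not part of the patch): a minimal IR sketch of the rewrite the new pass performs, distilled from the OPT checks in call-constexpr.ll above. The function name @callee and the value name %cast are illustrative only.

; Before: at OptNone InstCombine never runs, so the constantexpr bitcast of
; the callee survives and the call site looks indirect to the backend.
  %val = call float bitcast (i32 ()* @callee to float ()*)()

; After opt -amdgpu-fix-function-bitcasts: a direct call, with the result
; bitcast back to the type the original call site expected (mismatched
; arguments are likewise bitcast before the call, as in the @ident_i32 tests).
  %val = call i32 @callee()
  %cast = bitcast i32 %val to float

The pass only promotes a site when isLegalToPromote() from CallPromotionUtils agrees the signatures are compatible once the cast is stripped; incompatible cases such as the variadic bitcast in unsupported-calls.ll are left alone and still reach the existing unsupported-call diagnostic.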