Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -73,6 +73,10 @@
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+ModulePass *createAMDGPUFixFunctionBitcastsPass();
+void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
+extern char &AMDGPUFixFunctionBitcastsID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
Index: lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Promote indirect (bitcast) calls to direct calls when they are statically
+/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
+/// because AMDGPU does not support indirect calls.
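+///
+/// For example (illustrative IR; @callee stands in for any function reached
+/// through a constant bitcast):
+///
+///   %val = call float bitcast (i32 ()* @callee to float ()*)()
+///
+/// becomes a direct call to @callee, with CallPromotionUtils inserting the
+/// casts needed to reconcile the argument and return types.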
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
+
+namespace {
+class AMDGPUFixFunctionBitcasts final
+    : public ModulePass,
+      public InstVisitor<AMDGPUFixFunctionBitcasts> {
+
+  StringRef getPassName() const override {
+    return "AMDGPU Fix Function Bitcasts";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  bool runOnModule(Module &M) override;
+
+  bool Modified;
+
+public:
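+  // If the call goes through a constant bitcast of a known function and the
+  // promotion utilities can reconcile the types, rewrite it as a direct call.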
+  void visitCallInst(CallInst &CI) {
+    CallSite CS(&CI);
+    if (!CS.getCalledValue())
+      return;
+    if (CS.getCalledFunction())
+      return;
+    auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+    if (Callee && isLegalToPromote(CS, Callee)) {
+      promoteCall(CS, Callee);
+      Modified = true;
+    }
+  }
+
+  static char ID;
+  AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUFixFunctionBitcasts::ID = 0;
+char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID;
+INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE,
+                "Fix function bitcasts for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
+  return new AMDGPUFixFunctionBitcasts();
+}
+
+bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
+  Modified = false;
+  visit(M);
+  return Modified;
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -159,6 +159,7 @@
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
@@ -602,6 +603,10 @@
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  // This must occur before inlining, as the inliner will not look through
+  // bitcast calls.
+  addPass(createAMDGPUFixFunctionBitcastsPass());
+
   addPass(createAMDGPULowerIntrinsicsPass());
 
   if (TM.getTargetTriple().getArch() == Triple::r600 ||
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -39,6 +39,7 @@
   AMDGPUAsmPrinter.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstrInfo.cpp
Index: lib/Transforms/Utils/CallPromotionUtils.cpp
===================================================================
--- lib/Transforms/Utils/CallPromotionUtils.cpp
+++ lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -177,8 +177,8 @@
     InsertBefore = &*std::next(CS.getInstruction()->getIterator());
 
   // Bitcast the return value to the correct type.
-  auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(),
-                                RetTy, "", InsertBefore);
+  auto *Cast = CastInst::CreateBitOrPointerCast(CS.getInstruction(), RetTy, "",
+                                                InsertBefore);
   if (RetBitCast)
     *RetBitCast = Cast;
 
@@ -321,12 +321,14 @@
                             const char **FailureReason) {
   assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");
 
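+  // The DataLayout is needed below because isBitOrNoopPointerCastable also
+  // accepts pointer<->integer conversions, which are no-ops only when the
+  // integer width matches the target's pointer size.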
+  auto &DL = Callee->getParent()->getDataLayout();
+
   // Check the return type. The callee's return value type must be bitcast
   // compatible with the call site's type.
   Type *CallRetTy = CS.getInstruction()->getType();
   Type *FuncRetTy = Callee->getReturnType();
   if (CallRetTy != FuncRetTy)
-    if (!CastInst::isBitCastable(FuncRetTy, CallRetTy)) {
+    if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
       if (FailureReason)
         *FailureReason = "Return type mismatch";
       return false;
@@ -351,7 +353,7 @@
     Type *ActualTy = CS.getArgument(I)->getType();
     if (FormalTy == ActualTy)
       continue;
-    if (!CastInst::isBitCastable(ActualTy, FormalTy)) {
+    if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
       if (FailureReason)
         *FailureReason = "Argument type mismatch";
       return false;
@@ -396,8 +398,8 @@
     Type *FormalTy = CalleeType->getParamType(ArgNo);
     Type *ActualTy = Arg->getType();
     if (FormalTy != ActualTy) {
-      auto *Cast = CastInst::Create(Instruction::BitCast, Arg, FormalTy, "",
-                                    CS.getInstruction());
+      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "",
+                                                    CS.getInstruction());
       CS.setArgument(ArgNo, Cast);
     }
   }
Index: test/CodeGen/AMDGPU/call-constexpr.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/call-constexpr.ll
@@ -0,0 +1,95 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_noinline:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_noinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline:
+; GCN-NOT: s_getpc_b64
+; GCN-NOT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@lo+4
+; GCN-NOT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@hi+4
+; GCN-NOT: s_swappc_b64
+define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_alwaysinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_type:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_bitcast_argument_type() #0 {
+  %val = call i32 bitcast (i32(i32)* @ident_i32 to i32(float)*)(float 2.0)
+  %op = add i32 %val, 1
+  store volatile i32 %op, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
+  %val = call float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64
+define i32 @use_workitem_id_x(i32 %arg0) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %op = add i32 %id, %arg0
+  ret i32 %op
+}
+
+; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+4
+; GCN: v_mov_b32_e32 v0, 9
+; GCN: s_swappc_b64
+; GCN: v_add_f32_e32
+define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 {
+  %val = call float bitcast (i32(i32)* @use_workitem_id_x to float(i32)*)(i32 9)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; Callees appear last in the source file to test that we still lower their
+; arguments before we lower any calls to them.
+
+define i32 @ret_i32_noinline() #0 {
+  ret i32 4
+}
+
+define i32 @ret_i32_alwaysinline() #1 {
+  ret i32 4
+}
+
+define i32 @ident_i32(i32 %i) #0 {
+  ret i32 %i
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind noinline }
+attributes #1 = { alwaysinline nounwind }
+attributes #2 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -1,8 +1,4 @@
-; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck %s
-
-; FIXME: Error is misleading because it's not an indirect call.
-
-; CHECK: error: <unknown>:0:0: in function crash_call_constexpr_cast void (): unsupported indirect call to function foo
+; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
 
 ; Make sure that AMDGPUPromoteAlloca doesn't crash if the called
 ; function is a constantexpr cast of a function.
@@ -10,14 +6,18 @@
 declare void @foo(float addrspace(5)*) #0
 declare void @foo.varargs(...) #0
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
+; CHECK-LABEL: @crash_call_constexpr_cast(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (float addrspace(5)*)* @foo to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
   ret void
 }
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
+; CHECK-LABEL: @crash_call_constexpr_cast_varargs(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (...)* @foo.varargs to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
Index: test/CodeGen/AMDGPU/unsupported-calls.ll
===================================================================
--- test/CodeGen/AMDGPU/unsupported-calls.ll
+++ test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -53,7 +53,7 @@
 
 declare i32 @extern_variadic(...)
 
-; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported indirect call to function extern_variadic
+; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to variadic function extern_variadic
 ; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
 define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
   %add = fadd <4 x float> %arg0, %arg1