Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -92,6 +92,9 @@
 ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
+void initializeAMDGPUDeviceLibsPass(PassRegistry &);
+ModulePass *createAMDGPUDeviceLibsPass(TargetMachine &TM);
+
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry &);
 extern char &SIFixControlFlowLiveIntervalsID;

Index: lib/Target/AMDGPU/AMDGPUDeviceLibs.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUDeviceLibs.h
@@ -0,0 +1,17 @@
+#ifndef AMDGPU_DEVICE_LIBS_H
+#define AMDGPU_DEVICE_LIBS_H
+
+enum AMDGPUOptionMask {
+  AMD_OPT = 1,
+  UNSAFE_FP_MATH = 2,
+  NO_NANS_FP_MATH = 4,
+  NO_INFS_FP_MATH = 8,
+  FAST_FMA32 = 16,
+  FAST_FMA64 = 32,
+  FP32_DENORMS = 64,
+  FP64_DENORMS = 128,
+  ISA_VERSION_SHIFT = 8,
+  ISA_VERSION_WIDTH = 4,
+};
+
+#endif /* AMDGPU_DEVICE_LIBS_H */

Index: lib/Target/AMDGPU/AMDGPUDeviceLibsPass.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUDeviceLibsPass.cpp
@@ -0,0 +1,278 @@
+#include "AMDGPU.h"
+#include "AMDGPUDeviceLibs.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include <map>
+#include <set>
+
+#define DEBUG_TYPE "amdgpu-device-libs"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceLibs : public ModulePass {
+  const TargetMachine *TM;
+
+  uint64_t KernelOptionMask(Function &F);
+
+  bool IsAttributeSet(Function &F, StringRef Attr) {
+    return F.hasFnAttribute(Attr) &&
+           F.getFnAttribute(Attr).getValueAsString() != "false";
+  }
+
+public:
+  static char ID;
+
+  AMDGPUDeviceLibs(const TargetMachine *TM_ = nullptr)
+    : ModulePass(ID), TM(TM_) {}
+
+  bool runOnModule(Module &M) override;
+};
+
+uint64_t AMDGPUDeviceLibs::KernelOptionMask(Function &F) {
+  uint64_t Mask = 0;
+  Mask |= AMD_OPT; // Always set.
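+  // Bits below ISA_VERSION_SHIFT are boolean options; the bits above them
+  // encode the ISA version (see AMDGPUDeviceLibs.h).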
+  if (IsAttributeSet(F, "unsafe-fp-math"))
+    Mask |= UNSAFE_FP_MATH;
+  if (IsAttributeSet(F, "no-nans-fp-math"))
+    Mask |= NO_NANS_FP_MATH;
+  if (IsAttributeSet(F, "no-infs-fp-math"))
+    Mask |= NO_INFS_FP_MATH;
+
+  const SISubtarget *ST =
+      static_cast<const SISubtarget *>(TM->getSubtargetImpl(F));
+  if (ST->hasFastFMAF32())
+    Mask |= FAST_FMA32;
+
+  AMDGPU::IsaVersion ISA = ST->getIsaVersion();
+  Mask |= uint64_t(ISA.Major * 100 + ISA.Minor * 10 + ISA.Stepping)
+          << ISA_VERSION_SHIFT;
+
+  return Mask;
+}
+
+struct NewArgInfo {
+  Type *Ty;
+  const char *Name;
+};
+
+FunctionType *GetFunctionTypeWithNewArguments(FunctionType *FTy,
+                                              ArrayRef<NewArgInfo> NArgs) {
+  unsigned NumNArgs = NArgs.size();
+  SmallVector<Type *, 8> ArgTypes;
+  ArgTypes.reserve(NumNArgs + FTy->getNumParams());
+
+  // The new argument types come first, followed by the original ones.
+  for (unsigned Idx = 0; Idx != NumNArgs; ++Idx)
+    ArgTypes.push_back(NArgs[Idx].Ty);
+  ArgTypes.insert(ArgTypes.end(), FTy->param_begin(), FTy->param_end());
+
+  return FunctionType::get(FTy->getReturnType(), ArgTypes, FTy->isVarArg());
+}
+
+static AttributeSet ShiftAttributes(LLVMContext &Ctx, const AttributeSet &PAL,
+                                    unsigned NumParams, unsigned Shift) {
+  SmallVector<AttributeSet, 8> Attributes;
+  Attributes.push_back(AttributeSet::get(Ctx, PAL.getRetAttributes()));
+  // Parameter attributes keep their relative order but move right by Shift
+  // slots to make room for the prepended arguments.
+  for (unsigned Idx = 1; Idx <= NumParams; ++Idx) {
+    AttributeSet Params = PAL.getParamAttributes(Idx);
+    AttrBuilder B(Params, Idx);
+    Attributes.push_back(AttributeSet::get(Ctx, Idx + Shift, B));
+  }
+  Attributes.push_back(AttributeSet::get(Ctx, PAL.getFnAttributes()));
+  return AttributeSet::get(Ctx, Attributes);
+}
+
+Function *CreateFunctionWithNewArguments(Function *F,
+                                         ArrayRef<NewArgInfo> NArgs) {
+  unsigned NumNArgs = NArgs.size();
+
+  // Create a new function type with the extra leading parameters.
+  FunctionType *NFTy = cast<FunctionType>(F->getType()->getElementType());
+  NFTy = GetFunctionTypeWithNewArguments(NFTy, NArgs);
+
+  // Create the new function and insert it before F in the module.
+  Function *NF = Function::Create(NFTy, F->getLinkage());
+  F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+
+  Function::arg_iterator NI = NF->arg_begin();
+  for (unsigned Idx = 0; Idx != NumNArgs; ++Idx, ++NI)
+    if (const char *NName = NArgs[Idx].Name)
+      NI->setName(NName);
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old function empty.
+  NF->takeName(F);
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over
+  // to the new arguments, also transferring over the names as well.
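+  // NI already points past the prepended arguments, so old and new
+  // arguments pair up one-to-one here.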
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++NI) {
+    I->replaceAllUsesWith(&*NI);
+    NI->takeName(&*I);
+  }
+
+  NF->GlobalValue::copyAttributesFrom(F);
+  NF->setAttributes(ShiftAttributes(F->getParent()->getContext(),
+                                    F->getAttributes(),
+                                    (unsigned)F->arg_size(), NumNArgs));
+
+  NF->setCallingConv(F->getCallingConv());
+  if (F->hasGC())
+    NF->setGC(F->getGC());
+  else
+    NF->clearGC();
+
+  return NF;
+}
+
+void AddArgumentsToCallSite(CallSite &CS, SmallVectorImpl<Value *> &Args,
+                            Value *Callee) {
+  Instruction *Call = CS.getInstruction();
+  unsigned NumNArgs = (unsigned)Args.size();
+
+  // Args holds the new leading arguments; append the original operands.
+  Args.insert(Args.end(), CS.arg_begin(), CS.arg_end());
+
+  auto NPAL = ShiftAttributes(CS.getCaller()->getParent()->getContext(),
+                              CS.getAttributes(),
+                              (unsigned)CS.arg_size(), NumNArgs);
+
+  Instruction *NCall;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+    InvokeInst *NII = InvokeInst::Create(Callee, II->getNormalDest(),
+                                         II->getUnwindDest(), Args, "", Call);
+    NII->setCallingConv(II->getCallingConv());
+    NII->setAttributes(NPAL);
+    NCall = NII;
+  } else {
+    CallInst *CI = cast<CallInst>(Call);
+    CallInst *NCI = CallInst::Create(Callee, Args, "", Call);
+    NCI->setCallingConv(CI->getCallingConv());
+    NCI->setAttributes(NPAL);
+    if (CI->isTailCall())
+      NCI->setTailCall();
+    NCall = NCI;
+  }
+  NCall->setDebugLoc(Call->getDebugLoc());
+
+  if (!Call->use_empty())
+    Call->replaceAllUsesWith(NCall);
+  NCall->takeName(Call);
+
+  // Remove the old call from the program, reducing the use-count of the
+  // callee.
+  Call->eraseFromParent();
+}
+
+bool AMDGPUDeviceLibs::runOnModule(Module &M) {
+  SmallVector<Instruction *, 8> ToRemove;
+  std::set<Function *> Worklist, Functions;
+  std::set<CallInst *> Calls;
+
+  for (Function &F : M.getFunctionList())
+    Worklist.insert(&F);
+
+  // Collect every function that transitively calls __oclc_option_mask,
+  // together with the calls that need rewriting.
+  while (!Worklist.empty()) {
+    Function *F = *Worklist.begin();
+    Worklist.erase(F);
+    bool Changed = false;
+    for (Instruction &I : instructions(F)) {
+      CallInst *Call = dyn_cast<CallInst>(&I);
+      if (!Call)
+        continue;
+      Function *Callee = Call->getCalledFunction();
+      if (!Callee)
+        continue;
+      if (Callee->getName() == "__oclc_option_mask" ||
+          Functions.count(Callee)) {
+        Changed |= Functions.insert(F).second;
+        Changed |= Calls.insert(Call).second;
+      }
+    }
+    // If F was newly marked, revisit its callers so the mask gets threaded
+    // through the whole call chain.
+    if (Changed) {
+      for (Use &U : F->uses()) {
+        if (Instruction *Inst = dyn_cast<Instruction>(U.getUser())) {
+          Function *Caller = Inst->getFunction();
+          if (Functions.insert(Caller).second)
+            Worklist.insert(Caller);
+        }
+      }
+    }
+  }
+
+  // Give every non-kernel function on the path a new leading i64
+  // "__option_mask" parameter; kernels materialize the mask as a constant.
+  std::map<Function *, Function *> NewFunctions;
+  for (Function *F : Functions) {
+    if (F->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+        F->getCallingConv() == CallingConv::SPIR_KERNEL)
+      continue;
+    NewArgInfo Info;
+    Info.Ty = IntegerType::get(M.getContext(), 64);
+    Info.Name = "__option_mask";
+    NewFunctions[F] = CreateFunctionWithNewArguments(F, Info);
+  }
+
+  for (CallInst *Call : Calls) {
+    Function *Caller = Call->getFunction();
+    Function *Callee = Call->getCalledFunction();
+
+    // In a kernel the mask is a compile-time constant; elsewhere it is the
+    // parameter that was just prepended to the caller.
+    Value *OptionMask = nullptr;
+    if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+        Caller->getCallingConv() == CallingConv::SPIR_KERNEL) {
+      OptionMask = ConstantInt::get(Type::getInt64Ty(M.getContext()),
+                                    KernelOptionMask(*Caller));
+    } else {
+      for (Argument &Arg : Caller->args()) {
+        if (Arg.getName() == "__option_mask") {
+          OptionMask = &Arg;
+          break;
+        }
+      }
+    }
+    assert(OptionMask && "caller was not rewritten to carry the option mask");
+
+    if (Callee->getName() == "__oclc_option_mask") {
"__oclc_option_mask") { + Call->replaceAllUsesWith(OptionMask); + ToRemove.push_back(Call); + } else { + CallSite CS(Call); + Function* NewCallee = NewFunctions[Callee]; + Call->replaceUsesOfWith(Callee, NewCallee); + SmallVector NewArgs; + NewArgs.push_back(OptionMask); + AddArgumentsToCallSite(CS, NewArgs, NewCallee); + } + } + + for (Instruction *I : ToRemove) + I->eraseFromParent(); + return ToRemove.size() > 0; +} + +char AMDGPUDeviceLibs::ID = 0; +} + +ModulePass *llvm::createAMDGPUDeviceLibsPass(TargetMachine& tm) { + return new AMDGPUDeviceLibs(&tm); +} + +INITIALIZE_TM_PASS(AMDGPUDeviceLibs, DEBUG_TYPE, + "AMDGPU Device libraries pass", false, false) Index: lib/Target/AMDGPU/AMDGPUTargetMachine.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -50,6 +50,7 @@ TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + void addPreLinkPasses(PassManagerBase &) override; }; //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -89,6 +89,7 @@ initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); + initializeAMDGPUDeviceLibsPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -169,6 +170,10 @@ FSAttr.getValueAsString(); } +void AMDGPUTargetMachine::addPreLinkPasses(PassManagerBase &PM) { + PM.add(createAMDGPUDeviceLibsPass(*this)); +} + //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -34,6 +34,7 @@ AMDGPUAnnotateUniformValues.cpp AMDGPUAsmPrinter.cpp AMDGPUCodeGenPrepare.cpp + AMDGPUDeviceLibsPass.cpp AMDGPUFrameLowering.cpp AMDGPUTargetObjectFile.cpp AMDGPUIntrinsicInfo.cpp Index: test/CodeGen/AMDGPU/device-libs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/device-libs.ll @@ -0,0 +1,85 @@ +; RUN: opt -S -verify-machineinstrs -amdgpu-device-libs -inline -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck --check-prefix=SI %s +; RUN: opt -S -verify-machineinstrs -amdgpu-device-libs -inline -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck --check-prefix=VI %s + +declare extern_weak i64 @__oclc_option_mask() local_unnamed_addr + +define i64 @get_option_mask() local_unnamed_addr #0 { +entry: + %call = tail call i64 @__oclc_option_mask() #1 + ret i64 %call +} + +define i64 @get_option_mask2() local_unnamed_addr #0 { +entry: + %call = tail call i64 @get_option_mask() #1 + ret i64 %call +} + + +; FUNC-LABEL: {{^}}default_kernel: +; SI: store i64 179473, i64 addrspace(1)* %out +; VI: store i64 205569, i64 addrspace(1)* %out +define amdgpu_kernel void @default_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #2 { +entry: + %call = tail call i64 @__oclc_option_mask() #1 + store i64 %call, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}default_call_kernel: +; SI: store i64 179473, i64 addrspace(1)* %out +; 
+; VI: store i64 205569, i64 addrspace(1)* %out
+define amdgpu_kernel void @default_call_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #2 {
+entry:
+  %call = tail call i64 @get_option_mask() #0
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @default_call2_kernel(
+; SI: store i64 179473, i64 addrspace(1)* %out
+; VI: store i64 205569, i64 addrspace(1)* %out
+define amdgpu_kernel void @default_call2_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #2 {
+entry:
+  %call = tail call i64 @get_option_mask2() #0
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @unsafe_fp_kernel(
+; SI: store i64 179475, i64 addrspace(1)* %out
+; VI: store i64 205571, i64 addrspace(1)* %out
+define amdgpu_kernel void @unsafe_fp_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #3 {
+entry:
+  %call = tail call i64 @__oclc_option_mask() #1
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_nans_fp_math_kernel(
+; SI: store i64 179477, i64 addrspace(1)* %out
+; VI: store i64 205573, i64 addrspace(1)* %out
+define amdgpu_kernel void @no_nans_fp_math_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #4 {
+entry:
+  %call = tail call i64 @__oclc_option_mask() #1
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_infs_fp_math_kernel(
+; SI: store i64 179481, i64 addrspace(1)* %out
+; VI: store i64 205577, i64 addrspace(1)* %out
+define amdgpu_kernel void @no_infs_fp_math_kernel(i64 addrspace(1)* nocapture %out) local_unnamed_addr #5 {
+entry:
+  %call = tail call i64 @__oclc_option_mask() #1
+  store i64 %call, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind "unsafe-fp-math"="true" }
+attributes #4 = { nounwind "no-nans-fp-math"="true" }
+attributes #5 = { nounwind "no-infs-fp-math"="true" }
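+
+; The expected values decode as the ISA version (Major*100 + Minor*10 +
+; Stepping) shifted left by ISA_VERSION_SHIFT, ORed with the option bits:
+;   hawaii (gfx701): 179473 = (701 << 8) | AMD_OPT | FAST_FMA32
+;   fiji   (gfx803): 205569 = (803 << 8) | AMD_OPT
+; The unsafe/no-nans/no-infs variants add 2, 4, and 8 respectively.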