Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -51,6 +51,10 @@
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
+ModulePass *createAMDGPULowerIntrinsicsPass();
+void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
+extern char &AMDGPULowerIntrinsicsID;
+
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -451,10 +451,11 @@
 
   PredictableSelectIsExpensive = false;
 
-  // FIXME: Need to really handle these.
-  MaxStoresPerMemcpy = 4096;
-  MaxStoresPerMemmove = 4096;
-  MaxStoresPerMemset = 4096;
+  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to
+  // worry about these during lowering.
+  MaxStoresPerMemcpy = 0xffffffff;
+  MaxStoresPerMemmove = 0xffffffff;
+  MaxStoresPerMemset = 0xffffffff;
 
   setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::SHL);
Index: lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,127 @@
+//===-- AMDGPULowerIntrinsics.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+// Memory intrinsics with a known size of at most this many bytes are left
+// alone; larger or unknown sizes are expanded into loops here.
+const unsigned MaxStaticSize = 1024;
+
+class AMDGPULowerIntrinsics : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPULowerIntrinsics() : ModulePass(ID) { }
+  bool runOnModule(Module &M) override;
+  StringRef getPassName() const override {
+    return "AMDGPU Lower Intrinsics";
+  }
+};
+
+}
+
+char AMDGPULowerIntrinsics::ID = 0;
+
+char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
+
+INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
+                "Lower intrinsics", false, false)
+
+// TODO: Should refine based on estimated number of accesses (e.g. does it
+// require splitting based on alignment)
+static bool shouldExpandOperationWithSize(Value *Size) {
+  ConstantInt *CI = dyn_cast<ConstantInt>(Size);
+  return !CI || (CI->getZExtValue() > MaxStaticSize);
+}
+
+static bool expandMemIntrinsicUses(Function &F) {
+  Intrinsic::ID ID = F.getIntrinsicID();
+  bool Changed = false;
+
+  // Expanding a use erases it, so advance the iterator before visiting.
+  for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+    Instruction *Inst = cast<Instruction>(*I);
+    ++I;
+
+    switch (ID) {
+    case Intrinsic::memcpy: {
+      auto *Memcpy = cast<MemCpyInst>(Inst);
+      if (shouldExpandOperationWithSize(Memcpy->getLength())) {
+        convertMemCpyToLoop(Memcpy);
+        Changed = true;
+        Memcpy->eraseFromParent();
+      }
+
+      break;
+    }
+    case Intrinsic::memmove: {
+      auto *Memmove = cast<MemMoveInst>(Inst);
+      if (shouldExpandOperationWithSize(Memmove->getLength())) {
+        convertMemMoveToLoop(Memmove);
+        Changed = true;
+        Memmove->eraseFromParent();
+      }
+
+      break;
+    }
+    case Intrinsic::memset: {
+      auto *Memset = cast<MemSetInst>(Inst);
+      if (shouldExpandOperationWithSize(Memset->getLength())) {
+        convertMemSetToLoop(Memset);
+        Changed = true;
+        Memset->eraseFromParent();
+      }
+
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // Intrinsic functions are always declarations; skip everything else.
+  for (Function &F : M) {
+    if (!F.isDeclaration())
+      continue;
+
+    switch (F.getIntrinsicID()) {
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove:
+    case Intrinsic::memset:
+      if (expandMemIntrinsicUses(F))
+        Changed = true;
+      break;
+    default:
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
+  return new AMDGPULowerIntrinsics();
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -108,6 +108,7 @@
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
+  initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
@@ -472,6 +473,8 @@
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  addPass(createAMDGPULowerIntrinsicsPass());
+
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerLegacyPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -45,6 +45,7 @@
   AMDGPUTargetObjectFile.cpp
   AMDGPUIntrinsicInfo.cpp
   AMDGPUISelDAGToDAG.cpp
+  AMDGPULowerIntrinsics.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUUnifyMetadata.cpp
Index: test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
+
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
+
+declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) #1
+
+; Test the upper bound for sizes to leave as an intrinsic call
+; OPT-LABEL: @max_size_small_static_memcpy_caller0(
+; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+define void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+  ret void
+}
+
+; Smallest static size which will be expanded
+; OPT-LABEL: @min_size_large_static_memcpy_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT-NEXT: load i8
+; OPT: getelementptr
+; OPT-NEXT: store i8
+define void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @max_size_small_static_memmove_caller0(
+; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+define void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @min_size_large_static_memmove_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT-NEXT: load i8
+; OPT: getelementptr
+; OPT-NEXT: store i8
+define void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @max_size_small_static_memset_caller0(
+; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
+define void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @min_size_large_static_memset_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT: store i8
+define void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @variable_memcpy_caller0(
+; OPT-NOT: call
+; OPT: phi
+define void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @variable_memcpy_caller1(
+; OPT-NOT: call
+; OPT: phi
+define void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @memcpy_multi_use_one_function(
+; OPT-NOT: call
+; OPT: phi
+; OPT-NOT: call
+; OPT: phi
+; OPT-NOT: call
+define void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @memcpy_alt_type(
+; OPT: phi
+; OPT: getelementptr inbounds i8, i8 addrspace(3)*
+; OPT: load i8, i8 addrspace(3)*
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: store i8
+define void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
+  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; One of the uses in the function should be expanded, the other left alone.
+; OPT-LABEL: @memcpy_multi_use_one_function_keep_small(
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: load i8, i8 addrspace(1)*
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: store i8
+
+; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
+define void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
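
For context, the byte-wise loop a variable-length memcpy is expanded into has roughly the following shape. This is an illustrative sketch only: the block and value names are invented, and the exact IR comes from convertMemCpyToLoop in lib/Transforms/Utils/LowerMemIntrinsics.cpp, but it shows the phi/getelementptr/load/store pattern the CHECK lines above are matching.

; Sketch only; names are invented, actual output comes from LowerMemIntrinsics.cpp.
define void @expansion_sketch(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) {
entry:
  %nonzero = icmp ne i64 %n, 0
  br i1 %nonzero, label %copy, label %done

copy:                       ; copy one byte per iteration
  %i = phi i64 [ 0, %entry ], [ %i.next, %copy ]
  %src.gep = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %i
  %byte = load i8, i8 addrspace(1)* %src.gep
  %dst.gep = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %i
  store i8 %byte, i8 addrspace(1)* %dst.gep
  %i.next = add i64 %i, 1
  %cont = icmp ult i64 %i.next, %n
  br i1 %cont, label %copy, label %done

done:
  ret void
}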