Index: lib/Target/AMDGPU/AMDGPULibCalls.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -131,6 +131,9 @@
   // sin/cos
   bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);
 
+  // __read_pipe/__write_pipe
+  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B);
+
   // Get insertion point at entry.
   BasicBlock::iterator getEntryIns(CallInst * UI);
   // Insert an Alloc instruction.
@@ -559,6 +562,85 @@
   return true;
 }
 
+// Clang emits a call to __read_pipe_2 or __read_pipe_4 for the OpenCL
+// read_pipe builtin and appends the packet type's size and alignment as two
+// extra trailing arguments; the 2 or 4 is the original number of arguments.
+// The library has optimized versions of these functions for packets whose
+// size and alignment are the same power of 2. This function rewrites such a
+// call into a call to __read_pipe_2_N or __read_pipe_4_N, where N is the size
+// in bytes of the type (N = 1, 2, 4, 8, ..., 128): the two trailing arguments
+// are dropped and the packet pointer is cast to a pointer to an N-byte type.
+// The same is done for __write_pipe_2 and __write_pipe_4.
+//
+// Returns true if the call was replaced (CI is erased in that case) and false
+// if the call was left untouched.
+bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B) {
+  // Largest packet size, in bytes, that has a specialized library function.
+  const unsigned MaxPacketSize = 128;
+
+  auto *Callee = CI->getCalledFunction();
+  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
+  auto *M = Callee->getParent();
+  auto &Ctx = M->getContext();
+  std::string Name = Callee->getName();
+  auto NumArg = CI->getNumArgOperands();
+  if (NumArg != 4 && NumArg != 6)
+    return false;
+
+  // Only constant packet sizes and alignments can select a specialization.
+  auto *PacketSize = dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
+  auto *PacketAlign = dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
+  if (!PacketSize || !PacketAlign)
+    return false;
+  unsigned Size = PacketSize->getZExtValue();
+  unsigned Align = PacketAlign->getZExtValue();
+  if (Size != Align || !isPowerOf2_32(Size) || Size > MaxPacketSize)
+    return false;
+
+  // The caller dispatches on a name prefix only, so make sure the operand in
+  // front of the size really is a pointer before asking for its address space.
+  unsigned PtrArgLoc = NumArg - 3;
+  auto *PtrArg = CI->getArgOperand(PtrArgLoc);
+  if (!PtrArg->getType()->isPointerTy())
+    return false;
+
+  // Packets of up to 8 bytes are accessed as iN, larger ones as <N/8 x i64>.
+  Type *PtrElemTy;
+  if (Size <= 8)
+    PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
+  else
+    PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8);
+  unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
+  auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS);
+
+  // The specialized function keeps the leading arguments, takes the typed
+  // pointer, and has no size/alignment arguments.
+  SmallVector<llvm::Type *, 6> ArgTys;
+  for (unsigned I = 0; I != PtrArgLoc; ++I)
+    ArgTys.push_back(CI->getArgOperand(I)->getType());
+  ArgTys.push_back(PtrTy);
+
+  Name = Name + "_" + std::to_string(Size);
+
+  auto *FTy = FunctionType::get(Callee->getReturnType(),
+                                ArrayRef<Type *>(ArgTys), false);
+  auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
+
+  SmallVector<Value *, 6> Args;
+  for (unsigned I = 0; I != PtrArgLoc; ++I)
+    Args.push_back(CI->getArgOperand(I));
+  Args.push_back(BCast);
+
+  auto *F = M->getOrInsertFunction(Name, FTy);
+  auto *NCI = B.CreateCall(F, Args);
+  NCI->setAttributes(CI->getAttributes());
+  CI->replaceAllUsesWith(NCI);
+  CI->dropAllReferences();
+  CI->eraseFromParent();
+
+  return true;
+}
+
 // This function returns false if no change; return true otherwise.
 bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
   this->CI = CI;
@@ -567,21 +649,28 @@
   // Ignore indirect calls.
if (Callee == 0) return false; - FuncInfo FInfo; - if (!parseFunctionName(Callee->getName(), &FInfo)) - return false; - - // Further check the number of arguments to see if they match. - if (CI->getNumArgOperands() != FInfo.getNumArgs()) + if (!Callee->hasName()) return false; - BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); + BasicBlock *BB = CI->getParent(); IRBuilder<> B(Context); // Set the builder to the instruction after the call. B.SetInsertPoint(BB, CI->getIterator()); + auto Name = Callee->getName(); + if (Name.startswith("__read_pipe_") || Name.startswith("__write_pipe_")) + return fold_read_write_pipe(CI, B); + + FuncInfo FInfo; + if (!parseFunctionName(Callee->getName(), &FInfo)) + return false; + + // Further check the number of arguments to see if they match. + if (CI->getNumArgOperands() != FInfo.getNumArgs()) + return false; + // Copy fast flags from the original call. if (const FPMathOperator *FPOp = dyn_cast(CI)) B.setFastMathFlags(FPOp->getFastMathFlags()); Index: test/CodeGen/AMDGPU/simplify-libcalls.ll =================================================================== --- test/CodeGen/AMDGPU/simplify-libcalls.ll +++ test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -681,3 +681,96 @@ } declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*) + +%opencl.pipe_t = type opaque +%opencl.reserve_id_t = type opaque + +; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) +; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND:[0-9]+]] +; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr { +entry: + %0 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)* + %1 = 
addrspacecast i8 addrspace(1)* %0 to i8 addrspace(4)* + %2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %1, i32 4, i32 4) #0 + %3 = tail call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) + %4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %3, i32 2, i8 addrspace(4)* %1, i32 4, i32 4) #0 + tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %3, i32 4, i32 4) + ret void +} + +declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) + +declare %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) + +declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) + +declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) + +; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) +; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr { +entry: + %0 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)* + %1 = addrspacecast i8 addrspace(1)* %0 to i8 addrspace(4)* + %2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %1, i32 4, i32 4) #0 + %3 = tail call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0 + %4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %3, i32 2, i8 addrspace(4)* %1, i32 4, i32 4) #0 + tail call void 
@__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %3, i32 4, i32 4) #0 + ret void +} + +declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) local_unnamed_addr + +declare %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr + +declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) local_unnamed_addr + +declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) local_unnamed_addr + +%struct.S = type { [100 x i32] } + +; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size +; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}} i32 400, i32 4) #[[NOUNWIND]] +define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t 
addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 { +entry: + %0 = addrspacecast i8 addrspace(1)* %ptr1 to i8 addrspace(4)* + %1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(4)* %0, i32 1, i32 1) #0 + %2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)* + %3 = addrspacecast i8 addrspace(1)* %2 to i8 addrspace(4)* + %4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8 addrspace(4)* %3, i32 2, i32 2) #0 + %5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)* + %6 = addrspacecast i8 addrspace(1)* %5 to i8 addrspace(4)* + %7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8 addrspace(4)* %6, i32 4, i32 4) #0 + %8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)* + %9 = addrspacecast i8 addrspace(1)* %8 to i8 addrspace(4)* + %10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8 addrspace(4)* %9, i32 8, i32 8) #0 + %11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)* + %12 = addrspacecast i8 addrspace(1)* %11 to i8 addrspace(4)* + %13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8 addrspace(4)* %12, i32 16, i32 16) #0 + %14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)* + %15 = addrspacecast i8 addrspace(1)* %14 to i8 addrspace(4)* + %16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8 addrspace(4)* %15, i32 32, i32 32) #0 + %17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)* + %18 = addrspacecast i8 addrspace(1)* %17 to i8 addrspace(4)* + %19 = tail call i32 @__read_pipe_2(%opencl.pipe_t 
addrspace(1)* %p64, i8 addrspace(4)* %18, i32 64, i32 64) #0 + %20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)* + %21 = addrspacecast i8 addrspace(1)* %20 to i8 addrspace(4)* + %22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8 addrspace(4)* %21, i32 128, i32 128) #0 + %23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)* + %24 = addrspacecast i8 addrspace(1)* %23 to i8 addrspace(4)* + %25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8 addrspace(4)* %24, i32 400, i32 4) #0 + ret void +} + +; CGN-PRELINK: attributes #[[NOUNWIND]] = { nounwind } +attributes #0 = { nounwind }