Index: llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -91,8 +91,12 @@ // sqrt bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); - bool insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B, - const FuncInfo &FInfo); + /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value + /// of cos, sincos call). + std::tuple insertSinCos(Value *Arg, + FastMathFlags FMF, + IRBuilder<> &B, + FunctionCallee Fsincos); // sin/cos bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); @@ -1151,32 +1155,14 @@ return false; } -bool AMDGPULibCalls::insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B, - const FuncInfo &fInfo) { - Value *Arg = Sin->getOperand(0); - assert(Arg == Cos->getOperand(0)); - +std::tuple +AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B, + FunctionCallee Fsincos) { + DebugLoc DL = B.getCurrentDebugLocation(); Function *F = B.GetInsertBlock()->getParent(); - Module *M = F->getParent(); - // Merge the sin and cos. - - // for OpenCL 2.0 we have only generic implementation of sincos - // function. - // FIXME: This is not true anymore - AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo); - nf.getLeads()[0].PtrKind = - AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS); - FunctionCallee Fsincos = getFunction(M, nf); - if (!Fsincos) - return false; - B.SetInsertPointPastAllocas(F); - DILocation *MergedDebugLoc = - DILocation::getMergedLocation(Sin->getDebugLoc(), Cos->getDebugLoc()); - B.SetCurrentDebugLocation(MergedDebugLoc); - - AllocaInst *Alloc = B.CreateAlloca(Sin->getType(), nullptr, "__sincos_"); + AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_"); if (Instruction *ArgInst = dyn_cast(Arg)) { // If the argument is an instruction, it must dominate all uses so put our @@ -1184,7 +1170,9 @@ // if it's an argument or constant. B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); - B.SetCurrentDebugLocation(MergedDebugLoc); + + // SetInsertPoint unwelcomely always tries to set the debug loc. + B.SetCurrentDebugLocation(DL); } Value *P = Alloc; @@ -1195,25 +1183,12 @@ if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) P = B.CreateAddrSpaceCast(Alloc, PTy); - // Intersect the two sets of flags. - FastMathFlags FMF = cast(Sin)->getFastMathFlags(); - FMF &= cast(Cos)->getFastMathFlags(); - B.setFastMathFlags(FMF); - - CallInst *Call = CreateCallEx2(B, Fsincos, Arg, P); - LoadInst *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc); - Reload->setDebugLoc(Cos->getDebugLoc()); - - LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *Sin << ", " << *Cos - << ") with " << *Call << '\n'); - - Sin->replaceAllUsesWith(Call); - Sin->eraseFromParent(); - - Cos->replaceAllUsesWith(Reload); - Cos->eraseFromParent(); + CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, P); - return true; + // TODO: Is it worth trying to preserve the location for the cos calls for the + // load? + LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc); + return {SinCos, LoadCos, SinCos}; } // fold sin, cos -> sincos. @@ -1231,33 +1206,92 @@ Value *CArgVal = FPOp->getOperand(0); CallInst *CI = cast(FPOp); - bool Changed = false; + Function *F = B.GetInsertBlock()->getParent(); + Module *M = F->getParent(); + + // Merge the sin and cos. + + // for OpenCL 2.0 we have only generic implementation of sincos + // function. + // FIXME: This is not true anymore + AMDGPULibFunc SinCosLibFunc(AMDGPULibFunc::EI_SINCOS, fInfo); + SinCosLibFunc.getLeads()[0].PtrKind = + AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS); + FunctionCallee FSinCos = getFunction(M, SinCosLibFunc); + if (!FSinCos) + return false; + + SmallVector SinCalls; + SmallVector CosCalls; + SmallVector SinCosCalls; FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN, fInfo); const std::string PairName = PartnerInfo.mangle(); - CallInst *UI = nullptr; + StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName; + StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName(); + const std::string SinCosName = SinCosLibFunc.mangle(); + + // Intersect the two sets of flags. + FastMathFlags FMF = FPOp->getFastMathFlags(); + MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath); + + SmallVector MergeDbgLocs = {CI->getDebugLoc()}; - // TODO: Handle repeated uses, the generic implementation does. for (User* U : CArgVal->users()) { CallInst *XI = dyn_cast(U); - if (!XI || XI->isNoBuiltin()) + if (!XI || XI->getFunction() != F || XI->isNoBuiltin()) continue; Function *UCallee = XI->getCalledFunction(); - if (UCallee && UCallee->getName().equals(PairName)) - UI = XI; - else if (UI) - return Changed; + if (!UCallee) + continue; + + bool Handled = true; + + if (UCallee->getName() == SinName) + SinCalls.push_back(XI); + else if (UCallee->getName() == CosName) + CosCalls.push_back(XI); + else if (UCallee->getName() == SinCosName) + SinCosCalls.push_back(XI); + else + Handled = false; + + if (Handled) { + MergeDbgLocs.push_back(XI->getDebugLoc()); + auto *OtherOp = cast(XI); + FMF &= OtherOp->getFastMathFlags(); + FPMath = MDNode::getMostGenericFPMath( + FPMath, XI->getMetadata(LLVMContext::MD_fpmath)); + } } - if (!UI) - return Changed; + if (SinCalls.empty() || CosCalls.empty()) + return false; + + B.setFastMathFlags(FMF); + B.setDefaultFPMathTag(FPMath); + DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs); + B.SetCurrentDebugLocation(DbgLoc); + + auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos); + + auto replaceTrigInsts = [](ArrayRef Calls, Value *Res) { + for (CallInst *C : Calls) + C->replaceAllUsesWith(Res); + + // Leave the other dead instructions to avoid clobbering iterators. + }; - CallInst *Sin = isSin ? CI : UI; - CallInst *Cos = isSin ? UI : CI; - return insertSinCos(Sin, Cos, B, fInfo) || Changed; + replaceTrigInsts(SinCalls, Sin); + replaceTrigInsts(CosCalls, Cos); + replaceTrigInsts(SinCosCalls, SinCos); + + // It's safe to delete the original now. + CI->eraseFromParent(); + return true; } bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll @@ -110,6 +110,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; @@ -130,6 +131,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01) ; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; @@ -159,6 +161,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8 ; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]]) ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8 ; CHECK-NEXT: ret void ; Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll @@ -60,6 +60,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[__SINCOS_]]) ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -731,12 +731,14 @@ define void @sincos_f32_value_is_same_constexpr(ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f32_value_is_same_constexpr -; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float bitcast (i32 ptrtoint (ptr @func to i32) to float)) -; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float bitcast (i32 ptrtoint (ptr @func to i32) to float)) -; CHECK-NEXT: store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float bitcast (i32 ptrtoint (ptr @func to i32) to float), ptr [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -767,12 +769,14 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f32_value_is_same_constantfp -; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float 4.200000e+01) -; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01) -; CHECK-NEXT: store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -946,13 +950,18 @@ ; CHECK-LABEL: define void @sincos_f32_repeated_uses ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIN0:%.*]] = tail call contract float @_Z3sinf(float [[X]]) -; CHECK-NEXT: store volatile float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: store volatile float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: [[COS0:%.*]] = tail call contract float @_Z3cosf(float [[X]]) -; CHECK-NEXT: store volatile float [[COS0]], ptr addrspace(1) [[COS_OUT]], align 4 -; CHECK-NEXT: store volatile float [[COS0]], ptr addrspace(1) [[COS_OUT]], align 4 -; CHECK-NEXT: store volatile float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[__SINCOS_3:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_3]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr +; CHECK-NEXT: [[TMP3:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -1016,7 +1025,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr -; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]), !fpmath !5 ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 @@ -1036,7 +1045,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr -; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]), !fpmath !6 ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 @@ -1073,17 +1082,17 @@ define void @sincos_f32_debuginfo(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) !dbg !15 { ; CHECK-LABEL: define void @sincos_f32_debuginfo -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] !dbg [[DBG5:![0-9]+]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] !dbg [[DBG7:![0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5), !dbg [[DBG12:![0-9]+]] -; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr, !dbg [[DBG12]] -; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]), !dbg [[DBG12]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4, !dbg [[DBG13:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]] -; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG15:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP2]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG13]] -; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG16:![0-9]+]] -; CHECK-NEXT: ret void, !dbg [[DBG17:![0-9]+]] +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5), !dbg [[DBG14:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr, !dbg [[DBG14]] +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]), !dbg [[DBG14]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4, !dbg [[DBG14]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG15:![0-9]+]] +; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG16:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP2]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]] +; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG18:![0-9]+]] +; CHECK-NEXT: ret void, !dbg [[DBG19:![0-9]+]] ; entry: %call = tail call contract float @_Z3sinf(float %x), !dbg !19 @@ -1100,9 +1109,9 @@ ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !18 +; CHECK-NEXT: [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !5 ; CHECK-NEXT: store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !19 +; CHECK-NEXT: [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !6 ; CHECK-NEXT: [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4 ; CHECK-NEXT: store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret float [[SIN1]] @@ -1122,10 +1131,10 @@ ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !18 +; CHECK-NEXT: [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !5 ; CHECK-NEXT: store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: [[COS_TMP_CAST:%.*]] = addrspacecast ptr addrspace(5) [[COS_TMP]] to ptr -; CHECK-NEXT: [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !19 +; CHECK-NEXT: [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !6 ; CHECK-NEXT: [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4 ; CHECK-NEXT: store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret float [[SIN1]] @@ -1276,17 +1285,16 @@ ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS_TMP0:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[COS_TMP1:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: [[SIN0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP0]]) -; CHECK-NEXT: [[SIN1:%.*]] = call contract float @_Z3sinf(float [[X]]) -; CHECK-NEXT: store float [[SIN1]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: [[COS_TMP1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[COS_TMP1]] to ptr -; CHECK-NEXT: [[COS1:%.*]] = call contract float @_Z3cosf(float [[X]]) -; CHECK-NEXT: store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4 -; CHECK-NEXT: [[SIN2:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP1_CAST]]) +; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: [[COS2:%.*]] = load float, ptr addrspace(5) [[COS_TMP0]], align 4 ; CHECK-NEXT: store float [[COS2]], ptr addrspace(1) [[COS_OUT]], align 4 -; CHECK-NEXT: ret float [[SIN2]] +; CHECK-NEXT: ret float [[TMP1]] ; entry: %cos.tmp0 = alloca float, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll @@ -18,6 +18,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]]) ; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; @@ -39,6 +40,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8 ; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]]) ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8 ; CHECK-NEXT: ret void ; @@ -59,6 +61,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]]) ; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; @@ -79,6 +82,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8 ; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]]) ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8 ; CHECK-NEXT: ret void ;