diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -9,14 +9,20 @@
 //
 // Performs general IR level optimizations on SVE intrinsics.
 //
-// The main goal of this pass is to remove unnecessary reinterpret
-// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+// This pass performs the following optimizations:
 //
-//   %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
-//   %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+// - removes unnecessary reinterpret intrinsics
+//   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+//     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+//     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
 //
-// This pass also looks for ptest intrinsics & phi instructions where the
-// operands are being needlessly converted to and from svbool_t.
+// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
+//     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+//     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+//     ; (%1 can be replaced with a reinterpret of %2)
+//
+// - optimizes ptest intrinsics and phi instructions where the operands are
+//   being needlessly converted to and from svbool_t.
 //
 //===----------------------------------------------------------------------===//
 
@@ -56,8 +62,17 @@ private:
   static IntrinsicInst *isReinterpretToSVBool(Value *V);
 
+  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
+                                   SmallSetVector<IntrinsicInst *, 4> &PTrues);
+  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+
+  /// Operates at the instruction-scope. I.e., optimizations are applied local
+  /// to individual instructions.
   static bool optimizeIntrinsic(Instruction *I);
+  bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
 
+  /// Operates at the function-scope. I.e., optimizations are applied local to
+  /// the functions themselves.
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
 
   static bool optimizeConvertFromSVBool(IntrinsicInst *I);
@@ -95,6 +110,188 @@
   return I;
 }
 
+/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
+/// ptrue will introduce zeroing. For example:
+///
+///     %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+///     %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+///     %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+///
+/// %1 is promoted, because it is converted:
+///
+///     <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
+///
+/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
+bool isPTruePromoted(IntrinsicInst *PTrue) {
+  // Find all users of this intrinsic that are calls to convert-to-svbool
+  // reinterpret intrinsics.
+  SmallVector<IntrinsicInst *, 4> ConvertToUses;
+  for (User *User : PTrue->users()) {
+    if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
+      ConvertToUses.push_back(cast<IntrinsicInst>(User));
+    }
+  }
+
+  // If no such calls were found, this is not a promoted ptrue.
+  if (ConvertToUses.empty())
+    return false;
+
+  // Otherwise, try to find users of the convert-to-svbool intrinsics that are
+  // calls to the convert-from-svbool intrinsic, and would result in some lanes
+  // being zeroed.
+  const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
+  for (IntrinsicInst *ConvertToUse : ConvertToUses) {
+    for (User *User : ConvertToUse->users()) {
+      auto *IntrUser = dyn_cast<IntrinsicInst>(User);
+      if (IntrUser && IntrUser->getIntrinsicID() ==
+                          Intrinsic::aarch64_sve_convert_from_svbool) {
+        const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
+
+        // Would some lanes become zeroed by the conversion?
+        if (IntrUserVTy->getElementCount().getKnownMinValue() >
+            PTrueVTy->getElementCount().getKnownMinValue())
+          // This is a promoted ptrue.
+          return true;
+      }
+    }
+  }
+
+  // If no matching calls were found, this is not a promoted ptrue.
+  return false;
+}
+
+/// Attempts to coalesce ptrues in a basic block.
+bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
+    BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
+  if (PTrues.size() <= 1)
+    return false;
+
+  // Find the ptrue with the most lanes.
+  auto *MostEncompassingPTrue = *std::max_element(
+      PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
+        auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
+        auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
+        return PTrue1VTy->getElementCount().getKnownMinValue() <
+               PTrue2VTy->getElementCount().getKnownMinValue();
+      });
+
+  // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
+  // behind only the ptrues to be coalesced.
+  PTrues.remove(MostEncompassingPTrue);
+  PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); });
+
+  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
+  // safe to do this, since ptrue intrinsic calls are guaranteed to have no
+  // predecessors.
+  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
+
+  LLVMContext &Ctx = BB.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
+
+  auto *MostEncompassingPTrueVTy =
+      cast<VectorType>(MostEncompassingPTrue->getType());
+  auto *ConvertToSVBool = Builder.CreateIntrinsic(
+      Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
+      {MostEncompassingPTrue});
+
+  for (auto *PTrue : PTrues) {
+    auto *PTrueVTy = cast<VectorType>(PTrue->getType());
+
+    Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
+    auto *ConvertFromSVBool =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                                {PTrueVTy}, {ConvertToSVBool});
+    PTrue->replaceAllUsesWith(ConvertFromSVBool);
+    PTrue->eraseFromParent();
+  }
+
+  return true;
+}
+
+/// The goal of this function is to remove redundant calls to the SVE ptrue
+/// intrinsic in each basic block within the given functions.
+///
+/// SVE ptrues have two representations in LLVM IR:
+/// - a logical representation -- an arbitrary-width scalable vector of i1s,
+///   i.e. <vscale x N x i1>.
+/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
+///   scalable vector of i1s, i.e. <vscale x 16 x i1>.
+///
+/// The SVE ptrue intrinsic is used to create a logical representation of an SVE
+/// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
+/// P1 creates a logical SVE predicate that is at least as wide as the logical
+/// SVE predicate created by P2, then all of the bits that are true in the
+/// physical representation of P2 are necessarily also true in the physical
+/// representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to
+/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
+/// convert.{to,from}.svbool.
+///
+/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
+/// if they match the following conditions:
+///
+/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
+///   SV_ALL indicates that all bits of the predicate vector are to be set to
+///   true. SV_POW2 indicates that all bits of the predicate vector up to the
+///   largest power-of-two are to be set to true.
+/// - the result of the call to the intrinsic is not promoted to a wider
+///   predicate. In this case, keeping the extra ptrue leads to better codegen
+///   -- coalescing here would create an irreducible chain of SVE reinterprets
+///   via convert.{to,from}.svbool.
+///
+/// EXAMPLE:
+///
+///     %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
+///     ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
+///     ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
+///     ...
+///
+///     %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
+///     ; Logical:  <1, 1, 1, 1>
+///     ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
+///     ...
+///
+/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
+///
+///     %1 = <vscale x 8 x i1> ptrue(i32 31)
+///     %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
+///     %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
+///
+bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    for (auto &BB : *F) {
+      SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
+      SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
+
+      // For each basic block, collect the used ptrues and try to coalesce them.
+      for (Instruction &I : BB) {
+        if (I.use_empty())
+          continue;
+
+        auto *IntrI = dyn_cast<IntrinsicInst>(&I);
+        if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+          continue;
+
+        const auto PTruePattern =
+            cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
+
+        if (PTruePattern == AArch64SVEPredPattern::all)
+          SVAllPTrues.insert(IntrI);
+        if (PTruePattern == AArch64SVEPredPattern::pow2)
+          SVPow2PTrues.insert(IntrI);
+      }
+
+      Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
+      Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
+    }
+  }
+
+  return Changed;
+}
+
 /// The function will remove redundant reinterprets casting in the presence
 /// of the control flow
 bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
@@ -243,7 +440,7 @@
   return true;
 }
 
-bool SVEIntrinsicOpts::optimizeFunctions(
+bool SVEIntrinsicOpts::optimizeIntrinsicCalls(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
   for (auto *F : Functions) {
@@ -260,6 +457,16 @@
   return Changed;
 }
 
+bool SVEIntrinsicOpts::optimizeFunctions(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizeIntrinsicCalls(Functions);
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::runOnModule(Module &M) {
   bool Changed = false;
   SmallSetVector<Function *, 4> Functions;
@@ -276,6 +483,7 @@
     case Intrinsic::aarch64_sve_ptest_any:
     case Intrinsic::aarch64_sve_ptest_first:
     case Intrinsic::aarch64_sve_ptest_last:
+    case Intrinsic::aarch64_sve_ptrue:
       for (User *U : F.users())
         Functions.insert(cast<CallInst>(U)->getFunction());
       break;
diff --git a/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
+
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1>, i32*)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1>, i32*)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+
+; Two calls to the SVE ptrue intrinsic. %1 is redundant, and can be expressed as an SVE reinterpret of %3 via
+; convert.{to,from}.svbool.
+define <vscale x 8 x i32> @coalesce_test_basic(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_basic(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
+  ret <vscale x 8 x i32> %4
+}
+
+; Two calls to the SVE ptrue intrinsic with the SV_POW2 pattern. This should reduce to the same output as
+; coalesce_test_basic.
+define <vscale x 8 x i32> @coalesce_test_pow2(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_pow2(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
+  ret <vscale x 8 x i32> %4
+}
+
+; Four calls to the SVE ptrue intrinsic; two with the SV_ALL pattern, and two with the SV_POW2 pattern. The
+; two SV_ALL ptrue intrinsics should be coalesced, and the two SV_POW2 intrinsics should be coalesced.
+define <vscale x 8 x i32> @coalesce_test_all_and_pow2(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_all_and_pow2(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], i32* [[ADDR]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP6]], i32* [[ADDR]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP4]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP10]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+  %2 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+  %3 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %4 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+
+  %5 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %6 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %2, i32* %addr)
+  %7 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
+  %8 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %4, i32* %addr)
+  ret <vscale x 8 x i32> %8
+}
+
+
+; Two calls to the SVE ptrue intrinsic: one with the SV_ALL pattern, another with the SV_POW2 pattern. The
+; patterns are incompatible, so they should not be coalesced.
+define <vscale x 8 x i32> @coalesce_test_pattern_mismatch2(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_pattern_mismatch2(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
+  ret <vscale x 8 x i32> %4
+}
+
+; Two calls to the SVE ptrue intrinsic with the SV_VL1 pattern. This pattern is not currently recognised, so
+; nothing should be done here.
+define <vscale x 8 x i32> @coalesce_test_bad_pattern(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_bad_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
+  ret <vscale x 8 x i32> %4
+}
+
+; Four calls to the SVE ptrue intrinsic. %7 is the most encompassing, and the others can be expressed as
+; SVE reinterprets of %7 via convert.{to,from}.svbool.
+define <vscale x 16 x i32> @coalesce_test_multiple(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_multiple(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv16i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> [[TMP5]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP4]], i32* [[ADDR]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> [[TMP1]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+;
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %1, i32* %addr)
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %6 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %5, i32* %addr)
+  %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %8 = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> %7, i32* %addr)
+  ret <vscale x 16 x i32> %8
+}
+
+; Two calls to the SVE ptrue intrinsic which are both of the same size. In this case, one should be identified
+; as redundant and rewritten as an SVE reinterpret of the other via the convert.{to,from}.svbool intrinsics.
+; This introduces a redundant conversion which will then be eliminated.
+define <vscale x 4 x i32> @coalesce_test_same_size(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_same_size(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+;
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
+  ret <vscale x 4 x i32> %4
+}
+
+; Two calls to the SVE ptrue intrinsic, but neither can be eliminated; %1 is promoted to become %3, which
+; means eliminating this call to the SVE ptrue intrinsic would involve creating a longer, irreducible chain of
+; conversions. Better codegen is achieved by just leaving the ptrue as-is.
+define <vscale x 8 x i16> @coalesce_test_promoted_ptrue(i32* %addr1, i16* %addr2) {
+; CHECK-LABEL: @coalesce_test_promoted_ptrue(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP5]], i16* [[ADDR2:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], i16* [[ADDR2]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP8]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+
+  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr1)
+  %5 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %3, i16* %addr2)
+
+  %6 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %7 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %6, i16* %addr2)
+  ret <vscale x 8 x i16> %7
+}
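
A minimal standalone sketch of the comparison that the tests above exercise may be useful when reading the pass; it is not part of the patch, and the helper name `encompasses` is illustrative only. One ptrue covers another when its scalable result type has the larger known-minimum lane count: every lane set in the narrower predicate's physical (svbool) form is then also set in the wider one's, which is why the narrower call can be rewritten as convert.{to,from}.svbool of the wider one.

// Illustrative sketch only -- not part of the patch; 'encompasses' is a
// hypothetical helper name.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// A ptrue returning <vscale x M x i1> encompasses one returning
// <vscale x N x i1> when M >= N.
static bool encompasses(const IntrinsicInst *Wider,
                        const IntrinsicInst *Narrower) {
  const auto *WiderVTy = cast<ScalableVectorType>(Wider->getType());
  const auto *NarrowerVTy = cast<ScalableVectorType>(Narrower->getType());
  return WiderVTy->getElementCount().getKnownMinValue() >=
         NarrowerVTy->getElementCount().getKnownMinValue();
}

coalescePTrueIntrinsicCalls applies this comparison per basic block and per pattern (SV_ALL, SV_POW2): it keeps only the maximal ptrue of each group and reinterprets the rest, as checked by the tests above.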