diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -54,6 +54,7 @@
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
+ModulePass *createSVECoalescePTrueIntrinsicsPass();
 ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
@@ -91,6 +92,7 @@
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeSVEIntrinsicOptsPass(PassRegistry&);
+void initializeSVECoalescePTrueIntrinsicsPass(PassRegistry &);
 void initializeAArch64StackTaggingPass(PassRegistry&);
 void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -153,6 +153,10 @@
     cl::desc("Enable SVE intrinsic opts"), cl::init(true));
 
+static cl::opt<bool> EnableSVECoalescePTrueIntrinsics(
+    "aarch64-enable-sve-coalesce-ptrue-intrinsics", cl::Hidden,
+    cl::desc("Enable the SVE coalesce ptrue intrinsics pass"), cl::init(true));
+
 static cl::opt<bool>
     EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true),
                         cl::Hidden);
@@ -192,6 +196,7 @@
   initializeFalkorHWPFFixPass(*PR);
   initializeFalkorMarkStridedAccessesLegacyPass(*PR);
   initializeLDTLSCleanupPass(*PR);
+  initializeSVECoalescePTrueIntrinsicsPass(*PR);
   initializeSVEIntrinsicOptsPass(*PR);
   initializeAArch64SpeculationHardeningPass(*PR);
   initializeAArch64SLSHardeningPass(*PR);
@@ -452,6 +457,11 @@
   // ourselves.
   addPass(createAtomicExpandPass());
 
+  // Coalesce ptrue intrinsics.
+  if (EnableSVECoalescePTrueIntrinsics &&
+      TM->getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createSVECoalescePTrueIntrinsicsPass());
+
   // Expand any SVE vector library calls that we can't code generate directly.
   if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createSVEIntrinsicOptsPass());
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -75,6 +75,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  SVECoalescePTrueIntrinsics.cpp
   SVEIntrinsicOpts.cpp
   AArch64SIMDInstrOpt.cpp
diff --git a/llvm/lib/Target/AArch64/SVECoalescePTrueIntrinsics.cpp b/llvm/lib/Target/AArch64/SVECoalescePTrueIntrinsics.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AArch64/SVECoalescePTrueIntrinsics.cpp
@@ -0,0 +1,268 @@
+//===- SVECoalescePTrueIntrinsics - Eliminate Redundant SVE PTrue Calls --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The goal of this pass is to remove redundant calls to the SVE ptrue
+// intrinsic in each basic block.
+//
+// SVE ptrues have two representations in LLVM IR:
+// - a logical representation -- an arbitrary-width scalable vector of i1s,
+//   i.e. <vscale x N x i1>.
+// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
+//   scalable vector of i1s, i.e. <vscale x 16 x i1>.
+//
+// The SVE ptrue intrinsic is used to create a logical representation of an SVE
+// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
+// P1 creates a logical SVE predicate that is at least as wide as the logical
+// SVE predicate created by P2, then all of the bits that are true in the
+// physical representation of P2 are necessarily also true in the physical
+// representation of P1; P1 'encompasses' P2. Therefore, the intrinsic call to
+// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
+// convert.{to,from}.svbool.
+//
+// Currently, this pass only coalesces calls to SVE ptrue intrinsics
+// if they match the following conditions:
+//
+// - the call to the intrinsic uses the pattern SV_ALL, indicating that all
+//   bits of the predicate vector are to be set to true.
+// - the result of the call to the intrinsic is not promoted to a wider
+//   predicate. In this case, keeping the extra ptrue leads to better codegen
+//   -- coalescing here would create an irreducible chain of SVE reinterprets
+//   via convert.{to,from}.svbool.
+//
+// EXAMPLE:
+//
+//   %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
+//   ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
+//   ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
+//   ...
+//
+//   %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
+//   ; Logical:  <1, 1, 1, 1>
+//   ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
+//   ...
+//
+// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
+//
+//   %1 = <vscale x 8 x i1> ptrue(i32 31)
+//   %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
+//   %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "aarch64-sve-coalesce-ptrue-intrinsics"
+
+namespace llvm {
+void initializeSVECoalescePTrueIntrinsicsPass(PassRegistry &);
+}
+
+namespace {
+struct SVECoalescePTrueIntrinsics : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  SVECoalescePTrueIntrinsics() : ModulePass(ID) {
+    initializeSVECoalescePTrueIntrinsicsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  bool coalescePTrueIntrinsics(BasicBlock &BB,
+                               SmallSetVector<IntrinsicInst *, 4> &PTrues);
+  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
+};
+} // end anonymous namespace
+
+void SVECoalescePTrueIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+}
+
+char SVECoalescePTrueIntrinsics::ID = 0;
+static const char *name = "SVE coalesce ptrue intrinsics";
+INITIALIZE_PASS_BEGIN(SVECoalescePTrueIntrinsics, DEBUG_TYPE, name, false,
+                      false)
+INITIALIZE_PASS_END(SVECoalescePTrueIntrinsics, DEBUG_TYPE, name, false, false)
+
+namespace llvm {
+ModulePass *createSVECoalescePTrueIntrinsicsPass() {
+  return new SVECoalescePTrueIntrinsics();
+}
+} // namespace llvm
+
+/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
+/// ptrue will introduce zeroing. For example:
+///
+///   %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+///   %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+///   %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+///
+/// %1 is promoted, because it is converted:
+///
+///   <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
+///
+/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
+static bool isPTruePromoted(IntrinsicInst *PTrue) {
+  // Find all users of this intrinsic that are calls to convert-to-svbool
+  // reinterpret intrinsics.
+  SmallVector<IntrinsicInst *, 4> ConvertToUses;
+  for (User *User : PTrue->users()) {
+    auto *IntrUser = dyn_cast<IntrinsicInst>(User);
+    if (IntrUser && IntrUser->getIntrinsicID() ==
+                        Intrinsic::aarch64_sve_convert_to_svbool) {
+      ConvertToUses.push_back(IntrUser);
+    }
+  }
+
+  // If no such calls were found, this ptrue is not promoted.
+  if (ConvertToUses.empty())
+    return false;
+
+  // Otherwise, try to find users of the convert-to-svbool intrinsics that are
+  // calls to the convert-from-svbool intrinsic, and would result in some lanes
+  // being zeroed.
+  const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
+  for (IntrinsicInst *ConvertToUse : ConvertToUses) {
+    for (User *User : ConvertToUse->users()) {
+      auto *IntrUser = dyn_cast<IntrinsicInst>(User);
+      if (IntrUser && IntrUser->getIntrinsicID() ==
+                          Intrinsic::aarch64_sve_convert_from_svbool) {
+        const auto *IntrUserVTy =
+            cast<ScalableVectorType>(IntrUser->getType());
+
+        // Would some lanes become zeroed by the conversion?
+        if (IntrUserVTy->getElementCount().getKnownMinValue() >
+            PTrueVTy->getElementCount().getKnownMinValue())
+          // This is a promoted ptrue.
+          return true;
+      }
+    }
+  }
+
+  // If no matching calls were found, this is not a promoted ptrue.
+  return false;
+}
+
+/// Attempts to coalesce ptrues in a basic block.
+bool SVECoalescePTrueIntrinsics::coalescePTrueIntrinsics(
+    BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
+  if (PTrues.size() <= 1)
+    return false;
+
+  // Find the ptrue with the most lanes.
+  auto *MostEncompassingPTrue = *std::max_element(
+      PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
+        auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
+        auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
+        return PTrue1VTy->getElementCount().getKnownMinValue() <
+               PTrue2VTy->getElementCount().getKnownMinValue();
+      });
+
+  // Remove the most encompassing ptrue, as well as any promoted ptrues,
+  // leaving behind only the ptrues to be coalesced.
+  PTrues.remove(MostEncompassingPTrue);
+  PTrues.remove_if([&](auto *PTrue) { return isPTruePromoted(PTrue); });
+
+  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
+  // safe to do this, since ptrue intrinsic calls are guaranteed to have no
+  // predecessors.
+  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
+
+  LLVMContext &Ctx = BB.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
+
+  auto *MostEncompassingPTrueVTy =
+      cast<ScalableVectorType>(MostEncompassingPTrue->getType());
+  auto *ConvertToSVBool = Builder.CreateIntrinsic(
+      Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
+      {MostEncompassingPTrue});
+
+  for (auto *PTrue : PTrues) {
+    auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
+
+    Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
+    auto *ConvertFromSVBool =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                                {PTrueVTy}, {ConvertToSVBool});
+    PTrue->replaceAllUsesWith(ConvertFromSVBool);
+    PTrue->eraseFromParent();
+  }
+
+  return true;
+}
+
+bool SVECoalescePTrueIntrinsics::optimizeFunctions(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+  for (auto *F : Functions) {
+    for (auto &BB : *F) {
+      SmallSetVector<IntrinsicInst *, 4> PTrues;
+
+      // For each basic block, collect the used ptrues and try to coalesce
+      // them.
+      for (Instruction &I : BB) {
+        if (I.use_empty())
+          continue;
+
+        auto *IntrI = dyn_cast<IntrinsicInst>(&I);
+        if (!(IntrI &&
+              IntrI->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue))
+          continue;
+
+        const auto PTruePattern =
+            cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
+
+        if (PTruePattern == AArch64SVEPredPattern::all)
+          PTrues.insert(IntrI);
+      }
+
+      Changed |= coalescePTrueIntrinsics(BB, PTrues);
+    }
+  }
+
+  return Changed;
+}
+
+bool SVECoalescePTrueIntrinsics::runOnModule(Module &M) {
+  bool Changed = false;
+  SmallSetVector<Function *, 4> Functions;
+
+  // Check for SVE intrinsic declarations first, and store the functions where
+  // they are used in a set so that we only iterate over relevant functions
+  // once.
+  for (auto &F : M.getFunctionList()) {
+    if (!F.isDeclaration() ||
+        F.getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+      continue;
+
+    for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+      auto *Inst = dyn_cast<Instruction>(*I++);
+      if (!Inst)
+        continue;
+      Functions.insert(Inst->getFunction());
+    }
+  }
+
+  if (!Functions.empty())
+    Changed |= optimizeFunctions(Functions);
+
+  return Changed;
+}
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -18,6 +18,7 @@
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT:   Expand Atomic instructions
+; CHECK-NEXT: SVE coalesce ptrue intrinsics
 ; CHECK-NEXT: SVE intrinsics optimizations
 ; CHECK-NEXT:   FunctionPass Manager
 ; CHECK-NEXT:     Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-coalesce-ptrue-intrinsics -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
+
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1>, i32*)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1>, i32*)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+
+; Two calls to the SVE ptrue intrinsic. %1 is redundant, and can be expressed as an SVE reinterpret of %3 via
+; convert.{to,from}.svbool.
+define <vscale x 8 x i32> @coalesce_test_basic(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_basic(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
+  ret <vscale x 8 x i32> %4
+}
+
+; Two calls to the SVE ptrue intrinsic with the SV_VL1 pattern. This pattern is not currently recognised by
+; the aarch64-sve-coalesce-ptrue-intrinsics pass, so nothing should be done here.
+define <vscale x 8 x i32> @coalesce_test_bad_pattern(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_bad_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
+  ret <vscale x 8 x i32> %4
+}
+
+; Four calls to the SVE ptrue intrinsic. %7 is the most encompassing, and the others can be expressed as
+; SVE reinterprets of %7 via convert.{to,from}.svbool.
+define <vscale x 16 x i32> @coalesce_test_multiple(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_multiple(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv16i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> [[TMP5]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP4]], i32* [[ADDR]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> [[TMP1]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+;
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %1, i32* %addr)
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %6 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %5, i32* %addr)
+  %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %8 = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> %7, i32* %addr)
+  ret <vscale x 16 x i32> %8
+}
+
+; Two calls to the SVE ptrue intrinsic which are both of the same size. In this case, one should be identified
+; as redundant and rewritten as an SVE reinterpret of the other via the convert.{to,from}.svbool intrinsics.
+; This introduces a redundant conversion which will later be eliminated by the aarch64-sve-intrinsics-opts
+; pass.
+define <vscale x 4 x i32> @coalesce_test_same_size(i32* %addr) {
+; CHECK-LABEL: @coalesce_test_same_size(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
+;
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
+  ret <vscale x 4 x i32> %4
+}
+
+; Two calls to the SVE ptrue intrinsic, but neither can be eliminated; %1 is promoted to become %3, which
+; means eliminating this call to the SVE ptrue intrinsic would involve creating a longer, irreducible chain of
+; conversions. Better codegen is achieved by just leaving the ptrue as-is.
+define <vscale x 8 x i16> @coalesce_test_promoted_ptrue(i32* %addr1, i16* %addr2) {
+; CHECK-LABEL: @coalesce_test_promoted_ptrue(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP5]], i16* [[ADDR2:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], i16* [[ADDR2]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP8]]
+;
+  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+
+  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr1)
+  %5 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %3, i16* %addr2)
+
+  %6 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %7 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %6, i16* %addr2)
+  ret <vscale x 8 x i16> %7
+}
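As a quick illustration of the rewrite this patch performs (the same case exercised by @coalesce_test_basic above), the new pass can be run in isolation with the same opt invocation as the test's RUN line. The sketch below is illustrative only: input.ll and the value names are hypothetical, and the output reflects the behaviour described in this patch rather than verbatim tool output.

; $ opt -S -aarch64-sve-coalesce-ptrue-intrinsics -mtriple=aarch64-linux-gnu -mattr=+sve input.ll
;
; Before: two SV_ALL ptrues of different widths in one basic block.
;   %a = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
;   %b = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
;
; After: only the wider ptrue remains; the narrower predicate is recreated as a
; reinterpret of it via convert.{to,from}.svbool.
;   %b = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
;   %c = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %b)
;   %a = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %c)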