Index: llvm/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64.h
+++ llvm/lib/Target/AArch64/AArch64.h
@@ -52,6 +52,7 @@
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
+FunctionPass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
@@ -80,6 +81,7 @@
 void initializeFalkorHWPFFixPass(PassRegistry&);
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
+void initializeSVEIntrinsicOptsPass(PassRegistry&);
 void initializeAArch64StackTaggingPass(PassRegistry&);
 void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
 } // end namespace llvm
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -146,6 +146,11 @@
     cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
     cl::init(0));
 
+static cl::opt<bool> EnableSVEIntrinsicOpts(
+    "aarch64-sve-intrinsic-opts", cl::Hidden,
+    cl::desc("Enable SVE intrinsic opts"),
+    cl::init(true));
+
 static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
                                          cl::init(true), cl::Hidden);
 
@@ -182,6 +187,7 @@
   initializeFalkorHWPFFixPass(*PR);
   initializeFalkorMarkStridedAccessesLegacyPass(*PR);
   initializeLDTLSCleanupPass(*PR);
+  initializeSVEIntrinsicOptsPass(*PR);
   initializeAArch64SpeculationHardeningPass(*PR);
   initializeAArch64StackTaggingPass(*PR);
   initializeAArch64StackTaggingPreRAPass(*PR);
@@ -431,6 +437,10 @@
   // ourselves.
   addPass(createAtomicExpandPass());
 
+  // Expand any SVE vector library calls that we can't code generate directly.
+  if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createSVEIntrinsicOptsPass());
+
   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
   // ldrex/strex loops to simplify this, but it needs tidying up.
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -64,6 +64,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  SVEIntrinsicOpts.cpp
   AArch64SIMDInstrOpt.cpp
 
 DEPENDS
Index: llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -0,0 +1,245 @@
+//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Performs general IR level optimizations on SVE intrinsics.
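+//
+// The main goal of this pass is to remove redundant reinterpret intrinsics
+// (llvm.aarch64.sve.convert.[to|from].svbool). An illustrative sketch of a
+// pair the pass elides (types chosen for exposition):
+//
+//   %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+//   %2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+//
+// Here %2 is just %a, so uses of %2 are rewritten to %a and both calls are
+// removed once %1 has no other users. The pass also simplifies ptest
+// intrinsics and phi nodes whose operands are needlessly converted to and
+// from svbool_t.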
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "sve-intrinsicopts"
+
+namespace llvm {
+void initializeSVEIntrinsicOptsPass(PassRegistry &);
+}
+
+namespace {
+struct SVEIntrinsicOpts : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  SVEIntrinsicOpts() : FunctionPass(ID) {
+    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  static IntrinsicInst *isReinterpretFromSVBool(Value *V);
+  static IntrinsicInst *isReinterpretToSVBool(Value *V);
+
+  static bool optimizeBlock(BasicBlock *BB);
+  static bool optimizeIntrinsic(Instruction *I);
+
+  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
+  static bool optimizePTest(IntrinsicInst *I);
+
+  static bool processPhiNode(Instruction *I);
+
+  DominatorTree *DT;
+};
+} // end anonymous namespace
+
+void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.setPreservesCFG();
+}
+
+char SVEIntrinsicOpts::ID = 0;
+static const char *name = "SVE intrinsics optimizations";
+INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+
+namespace llvm {
+FunctionPass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
+} // namespace llvm
+
+/// Returns V if it's a cast to <vscale x 16 x i1> (aka svbool_t), nullptr
+/// otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+  if (!I)
+    return nullptr;
+
+  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
+    return nullptr;
+
+  return I;
+}
+
+/// Returns V if it's a cast from <vscale x 16 x i1> (aka svbool_t), nullptr
+/// otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretFromSVBool(Value *V) {
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+  if (!I)
+    return nullptr;
+
+  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_from_svbool)
+    return nullptr;
+
+  return I;
+}
+
+/// Removes redundant reinterpret casts that survive only because of control
+/// flow, by rewriting a phi of reinterprets as a single phi of their sources.
bool SVEIntrinsicOpts::processPhiNode(Instruction *X) {
+
+  SmallVector<Instruction *, 32> Worklist;
+  auto RequiredType = X->getType();
+
+  auto *PN = dyn_cast<PHINode>(X->getOperand(0));
+  if (!PN)
+    return false;
+
+  // Don't create a new Phi unless we can remove the old one.
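+  // An illustrative sketch of the rewrite (block/value names hypothetical):
+  //   %a1 = ... convert.to.svbool(%a)                      ; in %bb1
+  //   %b1 = ... convert.to.svbool(%b)                      ; in %bb2
+  //   %pg = phi <vscale x 16 x i1> [ %a1, %bb1 ], [ %b1, %bb2 ]
+  // When the phi's only user is a convert.from.svbool back to the incoming
+  // type, we phi the original %a/%b values instead and drop all the casts.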
+  if (!PN->hasOneUse())
+    return false;
+
+  for (Value *IncValPhi : PN->incoming_values()) {
+    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
+    if (!Reinterpret ||
+        RequiredType != Reinterpret->getArgOperand(0)->getType())
+      return false;
+  }
+
+  // Create the new Phi
+  LLVMContext &C1 = PN->getContext();
+  IRBuilder<> Builder(C1);
+  Builder.SetInsertPoint(PN);
+  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+  Worklist.push_back(PN);
+
+  for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+    auto *Reinterpret = cast<IntrinsicInst>(PN->getIncomingValue(i));
+    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(i));
+    Worklist.push_back(Reinterpret);
+  }
+
+  // Cleanup the old phi node and the reinterprets that fed it.
+  X->replaceAllUsesWith(NPN);
+  X->eraseFromParent();
+
+  for (auto &I : Worklist)
+    if (I->use_empty())
+      I->eraseFromParent();
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
+  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
+  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
+
+  if (Op1 && Op2 &&
+      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+    Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+    Module *M = I->getParent()->getParent()->getParent();
+
+    auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
+    auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
+
+    I->replaceAllUsesWith(CI);
+    I->eraseFromParent();
+    if (Op1->use_empty())
+      Op1->eraseFromParent();
+    // Op1 and Op2 may be the same instruction; guard against erasing twice.
+    if (Op1 != Op2 && Op2->use_empty())
+      Op2->eraseFromParent();
+
+    return true;
+  }
+
+  return false;
+}
+
+bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
+  assert(isReinterpretFromSVBool(I) && "expected convert.from.svbool");
+
+  // If the reinterpret instruction operand is a PHI Node
+  if (isa<PHINode>(I->getArgOperand(0)))
+    return processPhiNode(I);
+
+  // If we have a reinterpret intrinsic I of type A which is converting from
+  // another reinterpret Y of type B, and the source type of Y is A, then we
+  // can elide both reinterprets if there are no other users of Y.
+  if (auto *Y = isReinterpretToSVBool(I->getArgOperand(0))) {
+    Value *SourceVal = Y->getArgOperand(0);
+    if (I->getType() != SourceVal->getType())
+      return false;
+
+    I->replaceAllUsesWith(SourceVal);
+    I->eraseFromParent();
+    if (Y->use_empty())
+      Y->eraseFromParent();
+
+    return true;
+  }
+
+  return false;
+}
+
+bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
+  IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
+  if (!IntrI)
+    return false;
+
+  switch (IntrI->getIntrinsicID()) {
+  case Intrinsic::aarch64_sve_convert_from_svbool:
+    return optimizeConvertFromSVBool(IntrI);
+  case Intrinsic::aarch64_sve_ptest_any:
+  case Intrinsic::aarch64_sve_ptest_first:
+  case Intrinsic::aarch64_sve_ptest_last:
+    return optimizePTest(IntrI);
+  default:
+    return false;
+  }
+}
+
+bool SVEIntrinsicOpts::optimizeBlock(BasicBlock *BB) {
+  bool Changed = false;
+  for (auto II = BB->begin(), IE = BB->end(); II != IE;) {
+    Instruction *I = &(*II);
+    // Advance the iterator first; optimizeIntrinsic may erase I.
+    II = std::next(II);
+    Changed |= optimizeIntrinsic(I);
+  }
+  return Changed;
+}
+
+bool SVEIntrinsicOpts::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  bool Changed = false;
+
+  // Traverse the function with an RPO walk so we see defs before uses,
+  // allowing simplification to be done incrementally.
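+  // E.g. a convert.to/from.svbool pair elided in an earlier block leaves
+  // uses in later blocks already looking at the simplified value.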
+  BasicBlock *Root = DT->getRoot();
+  ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+  for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
+    Changed |= optimizeBlock(*I);
+  }
+
+  return Changed;
+}
Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -17,6 +17,8 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
 ; CHECK-NEXT:     Expand Atomic instructions
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     SVE intrinsics optimizations
 ; CHECK-NEXT:     Simplify the CFG
 ; CHECK-NEXT:     Dominator Tree Construction
 ; CHECK-NEXT:     Natural Loop Information
Index: llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -sve-intrinsicopts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+
+define i1 @ptest_any1(<vscale x 2 x i1> %a) {
+; OPT-LABEL: ptest_any1
+; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+; No transform because the ptest is using differently sized operands.
+define i1 @ptest_any2(<vscale x 4 x i1> %a) {
+; OPT-LABEL: ptest_any2
+; OPT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+define i1 @ptest_first(<vscale x 4 x i1> %a) {
+; OPT-LABEL: ptest_first
+; OPT: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+define i1 @ptest_last(<vscale x 8 x i1> %a) {
+; OPT-LABEL: ptest_last
+; OPT: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
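+
+; Note: an illustrative sketch (not checked above) of the ptest rewrite: a
+; call such as
+;   %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+; whose operands are convert.to.svbool casts from the same predicate type
+; becomes a direct ptest on the uncast operands, e.g.
+;   %out = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)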
Index: llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
@@ -0,0 +1,196 @@
+; RUN: opt -S -sve-intrinsicopts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+
+define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_h(
+; OPT: ret <vscale x 8 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  ret <vscale x 8 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_h_rev(
+; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_w(
+; OPT: ret <vscale x 4 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  ret <vscale x 4 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_w_rev(
+; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_d(
+; OPT: ret <vscale x 2 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1)
+  ret <vscale x 2 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_d_rev(
+; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions
+; OPT-NOT: convert
+; OPT-NOT: phi <vscale x 16 x i1>
+; OPT: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
+; OPT-NOT: convert
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+define <vscale x 2 x i1> @reinterpret_reductions_1(i32 %cond, <vscale x 2 x i1> %a, <vscale x 4 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions_1
+; OPT: convert
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+define <vscale x 2 x i1> @reinterpret_reductions_2(i32 %cond, <vscale x 2 x i1> %a, <vscale x 16 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions_2
+; OPT: convert
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+; Similar to reinterpret_reductions but the reinterprets remain because the
+; original phi cannot be removed (i.e. prefer reinterprets over multiple phis).
+define <vscale x 16 x i1> @reinterpret_reductions3(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions3
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT-NEXT: ret <vscale x 16 x i1> %pg
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 16 x i1> %pg
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)