diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -52,6 +52,7 @@
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
+ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
@@ -80,6 +81,7 @@
 void initializeFalkorHWPFFixPass(PassRegistry&);
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
+void initializeSVEIntrinsicOptsPass(PassRegistry&);
 void initializeAArch64StackTaggingPass(PassRegistry&);
 void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -146,6 +146,11 @@
     cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
     cl::init(0));
 
+static cl::opt<bool> EnableSVEIntrinsicOpts(
+    "aarch64-sve-intrinsic-opts", cl::Hidden,
+    cl::desc("Enable SVE intrinsic opts"),
+    cl::init(true));
+
 static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
                                          cl::init(true), cl::Hidden);
@@ -182,6 +187,7 @@
   initializeFalkorHWPFFixPass(*PR);
   initializeFalkorMarkStridedAccessesLegacyPass(*PR);
   initializeLDTLSCleanupPass(*PR);
+  initializeSVEIntrinsicOptsPass(*PR);
   initializeAArch64SpeculationHardeningPass(*PR);
   initializeAArch64StackTaggingPass(*PR);
   initializeAArch64StackTaggingPreRAPass(*PR);
@@ -434,6 +440,10 @@
   // ourselves.
   addPass(createAtomicExpandPass());
 
+  // Optimize SVE intrinsics at the IR level, e.g. by removing redundant
+  // svbool_t conversions, before instruction selection.
+  if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createSVEIntrinsicOptsPass());
+
   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
   // ldrex/strex loops to simplify this, but it needs tidying up.
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -64,6 +64,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  SVEIntrinsicOpts.cpp
   AArch64SIMDInstrOpt.cpp
 
   DEPENDS
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -0,0 +1,277 @@
+//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Performs general IR level optimizations on SVE intrinsics.
+//
+// The main goal of this pass is to remove unnecessary reinterpret
+// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+//
+// %1 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+// %2 = <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+//
+// This pass also looks for ptest intrinsics & phi instructions where the
+// operands are being needlessly converted to and from svbool_t.
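+//
+// For example (illustrative IR only, mirroring the ptest tests added below),
+// a ptest whose operands are both conversions from the same predicate type:
+//
+// %1 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+// %2 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+// %out = i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+//
+// is rewritten to call @llvm.aarch64.sve.ptest.any.nxv2i1 on %mask and %a
+// directly, leaving both conversions dead.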
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "sve-intrinsic-opts"
+
+namespace llvm {
+void initializeSVEIntrinsicOptsPass(PassRegistry &);
+}
+
+namespace {
+struct SVEIntrinsicOpts : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  SVEIntrinsicOpts() : ModulePass(ID) {
+    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  static IntrinsicInst *isReinterpretFromSVBool(Value *V);
+  static IntrinsicInst *isReinterpretToSVBool(Value *V);
+
+  static bool optimizeIntrinsic(Instruction *I);
+
+  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
+
+  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
+  static bool optimizePTest(IntrinsicInst *I);
+
+  static bool processPhiNode(IntrinsicInst *I);
+};
+} // end anonymous namespace
+
+void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.setPreservesCFG();
+}
+
+char SVEIntrinsicOpts::ID = 0;
+static const char *name = "SVE intrinsics optimizations";
+INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
+INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+
+namespace llvm {
+ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
+} // namespace llvm
+
+/// Returns V if it's a cast to <vscale x 16 x i1> (aka svbool_t), nullptr
+/// otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+  if (!I)
+    return nullptr;
+
+  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
+    return nullptr;
+
+  return I;
+}
+
+/// Returns V if it's a cast from <vscale x 16 x i1> (aka svbool_t), nullptr
+/// otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretFromSVBool(Value *V) {
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+  if (!I)
+    return nullptr;
+
+  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_from_svbool)
+    return nullptr;
+
+  return I;
+}
+
+/// Removes redundant svbool_t conversions that only exist because the
+/// converted values flow through a phi node, i.e. in the presence of
+/// control flow.
+bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
+
+  SmallVector<Instruction *, 32> Worklist;
+  auto RequiredType = X->getType();
+
+  auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
+  assert(PN && "Expected Phi Node!");
+
+  // Don't create a new Phi unless we can remove the old one.
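+  // For illustration (names hypothetical), the aim is to turn:
+  //   %pg = phi <vscale x 16 x i1> [ %a1, %bb1 ], [ %b1, %bb2 ]
+  //   %r  = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  // where %a1/%b1 are conversions to svbool, into a phi of <vscale x 2 x i1>
+  // over the original operands, which only pays off if the svbool phi itself
+  // can be deleted.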
+  if (!PN->hasOneUse())
+    return false;
+
+  for (Value *IncValPhi : PN->incoming_values()) {
+    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
+    if (!Reinterpret ||
+        RequiredType != Reinterpret->getArgOperand(0)->getType())
+      return false;
+  }
+
+  // Create the new Phi.
+  LLVMContext &Ctx = PN->getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(PN);
+  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+  Worklist.push_back(PN);
+
+  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
+    auto *Reinterpret = cast<IntrinsicInst>(PN->getIncomingValue(I));
+    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
+    Worklist.push_back(Reinterpret);
+  }
+
+  // Clean up the old Phi node and the reinterprets feeding it.
+  X->replaceAllUsesWith(NPN);
+  X->eraseFromParent();
+
+  for (auto &I : Worklist)
+    if (I->use_empty())
+      I->eraseFromParent();
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
+  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
+  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
+
+  if (Op1 && Op2 &&
+      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+    Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+    Module *M = I->getParent()->getParent()->getParent();
+
+    auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
+    auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
+
+    I->replaceAllUsesWith(CI);
+    I->eraseFromParent();
+    if (Op1->use_empty())
+      Op1->eraseFromParent();
+    if (Op2->use_empty())
+      Op2->eraseFromParent();
+
+    return true;
+  }
+
+  return false;
+}
+
+bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
+  assert(isReinterpretFromSVBool(I));
+
+  // If the reinterpret instruction operand is a PHI node, handle it separately.
+  if (isa<PHINode>(I->getArgOperand(0)))
+    return processPhiNode(I);
+
+  // If we have a reinterpret intrinsic I of type A which is converting from
+  // another reinterpret Y of type B, and the source type of Y is A, then we
+  // can elide away both reinterprets if there are no other users of Y.
+  auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
+  if (!Y)
+    return false;
+
+  Value *SourceVal = Y->getArgOperand(0);
+  if (I->getType() != SourceVal->getType())
+    return false;
+
+  I->replaceAllUsesWith(SourceVal);
+  I->eraseFromParent();
+  if (Y->use_empty())
+    Y->eraseFromParent();
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
+  IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
+  if (!IntrI)
+    return false;
+
+  switch (IntrI->getIntrinsicID()) {
+  case Intrinsic::aarch64_sve_convert_from_svbool:
+    return optimizeConvertFromSVBool(IntrI);
+  case Intrinsic::aarch64_sve_ptest_any:
+  case Intrinsic::aarch64_sve_ptest_first:
+  case Intrinsic::aarch64_sve_ptest_last:
+    return optimizePTest(IntrI);
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizeFunctions(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+  for (auto *F : Functions) {
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+
+    // Traverse the DT with an RPO walk so we see defs before uses, allowing
+    // simplification to be done incrementally.
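+    // (A conversion simplified early in the walk may leave its users trivially
+    // simplifiable when they are visited later in the same walk.)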
+    BasicBlock *Root = DT->getRoot();
+    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+    for (auto *BB : RPOT)
+      for (Instruction &I : make_early_inc_range(*BB))
+        Changed |= optimizeIntrinsic(&I);
+  }
+  return Changed;
+}
+
+bool SVEIntrinsicOpts::runOnModule(Module &M) {
+  bool Changed = false;
+  SmallSetVector<Function *, 4> Functions;
+
+  // Check for SVE intrinsic declarations first so that we only iterate over
+  // relevant functions. Where an appropriate declaration is found, store the
+  // function(s) where it is used so we can target these only.
+  for (auto &F : M.getFunctionList()) {
+    if (!F.isDeclaration())
+      continue;
+
+    switch (F.getIntrinsicID()) {
+    case Intrinsic::aarch64_sve_convert_from_svbool:
+    case Intrinsic::aarch64_sve_ptest_any:
+    case Intrinsic::aarch64_sve_ptest_first:
+    case Intrinsic::aarch64_sve_ptest_last:
+      for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+        auto *Inst = dyn_cast<Instruction>(*I++);
+        Functions.insert(Inst->getFunction());
+      }
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Functions.empty())
+    Changed |= optimizeFunctions(Functions);
+
+  return Changed;
+}
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -18,6 +18,10 @@
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
 ; CHECK-NEXT:     Expand Atomic instructions
+; CHECK-NEXT:   SVE intrinsics optimizations
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:   FunctionPass Manager
 ; CHECK-NEXT:     Simplify the CFG
 ; CHECK-NEXT:     Dominator Tree Construction
 ; CHECK-NEXT:     Natural Loop Information
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+
+define i1 @ptest_any1(<vscale x 2 x i1> %a) {
+; OPT-LABEL: ptest_any1
+; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+; OPT-NOT: convert
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+; No transform because the ptest is using differently sized operands.
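+; (%mask is an <vscale x 2 x i1> predicate while %a is <vscale x 4 x i1>, so
+; the two convert.to.svbool calls cannot simply be dropped.)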
+define i1 @ptest_any2(<vscale x 4 x i1> %a) {
+; OPT-LABEL: ptest_any2
+; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; OPT-NEXT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+; OPT-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+define i1 @ptest_first(<vscale x 4 x i1> %a) {
+; OPT-LABEL: ptest_first
+; OPT: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+; OPT-NOT: convert
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+define i1 @ptest_last(<vscale x 8 x i1> %a) {
+; OPT-LABEL: ptest_last
+; OPT: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+; OPT-NOT: convert
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
@@ -0,0 +1,203 @@
+; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+
+define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_h(
+; OPT-NOT: convert
+; OPT: ret <vscale x 8 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  ret <vscale x 8 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
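+; (The from.svbool/to.svbool round trip keeps only the lanes that exist in the
+; intermediate <vscale x 8 x i1> value and zeros the rest, so it is not the
+; identity on %a.)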
+define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_h_rev(
+; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_w(
+; OPT-NOT: convert
+; OPT: ret <vscale x 4 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  ret <vscale x 4 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_w_rev(
+; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_d(
+; OPT-NOT: convert
+; OPT: ret <vscale x 2 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1)
+  ret <vscale x 2 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_d_rev(
+; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions
+; OPT-NOT: convert
+; OPT-NOT: phi <vscale x 16 x i1>
+; OPT: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
+; OPT-NOT: convert
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+; No transform as the reinterprets are converting from different types (nxv2i1 & nxv4i1).
+; As the incoming values to the phi must all be the same type, we cannot remove the reinterprets.
+define <vscale x 2 x i1> @reinterpret_reductions_1(i32 %cond, <vscale x 2 x i1> %a, <vscale x 4 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions_1
+; OPT: convert
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+; No transform. Similar to the test above, but here only two of the arguments need to
+; be converted to svbool.
+define <vscale x 2 x i1> @reinterpret_reductions_2(i32 %cond, <vscale x 2 x i1> %a, <vscale x 16 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions_2
+; OPT: convert
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+; Similar to reinterpret_reductions but the reinterprets remain because the
+; original phi cannot be removed (i.e. prefer reinterprets over multiple phis).
+define <vscale x 16 x i1> @reinterpret_reductions3(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions3
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT-NEXT: ret <vscale x 16 x i1> %pg
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 16 x i1> %pg
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)