diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -682,6 +682,115 @@
              : None;
 }
 
+static Optional<Instruction *> instCombinePtest(InstCombiner &IC,
+                                                IntrinsicInst &II) {
+  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
+  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+
+  if (Op1 && Op2 &&
+      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+    IRBuilder<> Builder(II.getContext());
+    Builder.SetInsertPoint(&II);
+
+    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+    Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+
+    auto *Ptest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
+
+    Ptest->takeName(&II);
+    return IC.replaceInstUsesWith(II, Ptest);
+  }
+
+  return None;
+}
+
+static Optional<Instruction *> instCombineVectorMul(InstCombiner &IC,
+                                                    IntrinsicInst &II) {
+  auto *OpPredicate = II.getOperand(0);
+  auto *OpMultiplicand = II.getOperand(1);
+  auto *OpMultiplier = II.getOperand(2);
+
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+
+  // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
+  // with a unit splat value, false otherwise.
+  auto IsUnitDupX = [](auto *I) {
+    auto *IntrI = dyn_cast<IntrinsicInst>(I);
+    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+      return false;
+
+    auto *SplatValue = IntrI->getOperand(0);
+    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+  };
+
+  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
+  // with a unit splat value, false otherwise.
+  auto IsUnitDup = [](auto *I) {
+    auto *IntrI = dyn_cast<IntrinsicInst>(I);
+    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
+      return false;
+
+    auto *SplatValue = IntrI->getOperand(2);
+    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+  };
+
+  // The OpMultiplier variable should always point to the dup (if any), so
+  // swap if necessary.
+  if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
+    std::swap(OpMultiplier, OpMultiplicand);
+
+  if (IsUnitDupX(OpMultiplier)) {
+    // [f]mul pg (dupx 1) %n => %n
+    OpMultiplicand->takeName(&II);
+    return IC.replaceInstUsesWith(II, OpMultiplicand);
+  } else if (IsUnitDup(OpMultiplier)) {
+    // [f]mul pg (dup pg 1) %n => %n
+    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
+    auto *DupPg = DupInst->getOperand(1);
+    // TODO: this is naive. The optimization is still valid if DupPg
+    // 'encompasses' OpPredicate, not only if they're the same predicate.
+    if (OpPredicate == DupPg) {
+      OpMultiplicand->takeName(&II);
+      return IC.replaceInstUsesWith(II, OpMultiplicand);
+    }
+  }
+
+  return None;
+}
+
+static Optional<Instruction *> instCombineTBL(InstCombiner &IC,
+                                              IntrinsicInst &II) {
+  auto *OpVal = II.getOperand(0);
+  auto *OpIndices = II.getOperand(1);
+  VectorType *VTy = cast<VectorType>(II.getType());
+
+  // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
+  // constant splat value < minimal element count of result.
+  auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
+  if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+    return None;
+
+  auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
+  if (!SplatValue ||
+      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
+    return None;
+
+  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
+  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
+  auto *VectorSplat =
+      Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
+
+  VectorSplat->takeName(&II);
+  return IC.replaceInstUsesWith(II, VectorSplat);
+}
+
 Optional<Instruction *>
 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
@@ -709,6 +818,15 @@
     return instCombineSVECntElts(IC, II, 8);
   case Intrinsic::aarch64_sve_cntb:
     return instCombineSVECntElts(IC, II, 16);
+  case Intrinsic::aarch64_sve_ptest_any:
+  case Intrinsic::aarch64_sve_ptest_first:
+  case Intrinsic::aarch64_sve_ptest_last:
+    return instCombinePtest(IC, II);
+  case Intrinsic::aarch64_sve_mul:
+  case Intrinsic::aarch64_sve_fmul:
+    return instCombineVectorMul(IC, II);
+  case Intrinsic::aarch64_sve_tbl:
+    return instCombineTBL(IC, II);
   }
 
   return None;
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -60,18 +60,9 @@
                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
 
-  /// Operates at the instruction-scope. I.e., optimizations are applied local
-  /// to individual instructions.
-  static bool optimizeIntrinsic(Instruction *I);
-  bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
-
   /// Operates at the function-scope. I.e., optimizations are applied local to
   /// the functions themselves.
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
-
-  static bool optimizePTest(IntrinsicInst *I);
-  static bool optimizeVectorMul(IntrinsicInst *I);
-  static bool optimizeTBL(IntrinsicInst *I);
 };
 
 } // end anonymous namespace
@@ -285,185 +276,11 @@
   return Changed;
 }
 
-bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
-  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
-  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
-
-  if (Op1 && Op2 &&
-      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
-      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
-      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
-
-    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
-    Type *Tys[] = {Op1->getArgOperand(0)->getType()};
-    Module *M = I->getParent()->getParent()->getParent();
-
-    auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
-    auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
-
-    I->replaceAllUsesWith(CI);
-    I->eraseFromParent();
-    if (Op1->use_empty())
-      Op1->eraseFromParent();
-    if (Op1 != Op2 && Op2->use_empty())
-      Op2->eraseFromParent();
-
-    return true;
-  }
-
-  return false;
-}
-
-bool SVEIntrinsicOpts::optimizeVectorMul(IntrinsicInst *I) {
-  assert((I->getIntrinsicID() == Intrinsic::aarch64_sve_mul ||
-          I->getIntrinsicID() == Intrinsic::aarch64_sve_fmul) &&
-         "Unexpected opcode");
-
-  auto *OpPredicate = I->getOperand(0);
-  auto *OpMultiplicand = I->getOperand(1);
-  auto *OpMultiplier = I->getOperand(2);
-
-  // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
-  // with a unit splat value, false otherwise.
-  auto IsUnitDupX = [](auto *I) {
-    auto *IntrI = dyn_cast<IntrinsicInst>(I);
-    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
-      return false;
-
-    auto *SplatValue = IntrI->getOperand(0);
-    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
-  };
-
-  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
-  // with a unit splat value, false otherwise.
-  auto IsUnitDup = [](auto *I) {
-    auto *IntrI = dyn_cast<IntrinsicInst>(I);
-    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
-      return false;
-
-    auto *SplatValue = IntrI->getOperand(2);
-    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
-  };
-
-  bool Changed = true;
-
-  // The OpMultiplier variable should always point to the dup (if any), so
-  // swap if necessary.
-  if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
-    std::swap(OpMultiplier, OpMultiplicand);
-
-  if (IsUnitDupX(OpMultiplier)) {
-    // [f]mul pg (dupx 1) %n => %n
-    I->replaceAllUsesWith(OpMultiplicand);
-    I->eraseFromParent();
-    Changed = true;
-  } else if (IsUnitDup(OpMultiplier)) {
-    // [f]mul pg (dup pg 1) %n => %n
-    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
-    auto *DupPg = DupInst->getOperand(1);
-    // TODO: this is naive. The optimization is still valid if DupPg
-    // 'encompasses' OpPredicate, not only if they're the same predicate.
-    if (OpPredicate == DupPg) {
-      I->replaceAllUsesWith(OpMultiplicand);
-      I->eraseFromParent();
-      Changed = true;
-    }
-  }
-
-  // If an instruction was optimized out then it is possible that some dangling
-  // instructions are left.
-  if (Changed) {
-    auto *OpPredicateInst = dyn_cast<Instruction>(OpPredicate);
-    auto *OpMultiplierInst = dyn_cast<Instruction>(OpMultiplier);
-    if (OpMultiplierInst && OpMultiplierInst->use_empty())
-      OpMultiplierInst->eraseFromParent();
-    if (OpPredicateInst && OpPredicateInst->use_empty())
-      OpPredicateInst->eraseFromParent();
-  }
-
-  return Changed;
-}
-
-bool SVEIntrinsicOpts::optimizeTBL(IntrinsicInst *I) {
-  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_tbl &&
-         "Unexpected opcode");
-
-  auto *OpVal = I->getOperand(0);
-  auto *OpIndices = I->getOperand(1);
-  VectorType *VTy = cast<VectorType>(I->getType());
-
-  // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
-  // constant splat value < minimal element count of result.
-  auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
-  if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
-    return false;
-
-  auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
-  if (!SplatValue ||
-      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
-    return false;
-
-  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
-  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
-  LLVMContext &Ctx = I->getContext();
-  IRBuilder<> Builder(Ctx);
-  Builder.SetInsertPoint(I);
-  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
-  auto *VectorSplat =
-      Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
-
-  I->replaceAllUsesWith(VectorSplat);
-  I->eraseFromParent();
-  if (DupXIntrI->use_empty())
-    DupXIntrI->eraseFromParent();
-  return true;
-}
-
-bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
-  IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
-  if (!IntrI)
-    return false;
-
-  switch (IntrI->getIntrinsicID()) {
-  case Intrinsic::aarch64_sve_fmul:
-  case Intrinsic::aarch64_sve_mul:
-    return optimizeVectorMul(IntrI);
-  case Intrinsic::aarch64_sve_ptest_any:
-  case Intrinsic::aarch64_sve_ptest_first:
-  case Intrinsic::aarch64_sve_ptest_last:
-    return optimizePTest(IntrI);
-  case Intrinsic::aarch64_sve_tbl:
-    return optimizeTBL(IntrI);
-  default:
-    return false;
-  }
-
-  return true;
-}
-
-bool SVEIntrinsicOpts::optimizeIntrinsicCalls(
-    SmallSetVector<Function *, 4> &Functions) {
-  bool Changed = false;
-  for (auto *F : Functions) {
-    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
-
-    // Traverse the DT with an rpo walk so we see defs before uses, allowing
-    // simplification to be done incrementally.
-    BasicBlock *Root = DT->getRoot();
-    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
-    for (auto *BB : RPOT)
-      for (Instruction &I : make_early_inc_range(*BB))
-        Changed |= optimizeIntrinsic(&I);
-  }
-  return Changed;
-}
-
 bool SVEIntrinsicOpts::optimizeFunctions(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
 
   Changed |= optimizePTrueIntrinsicCalls(Functions);
-  Changed |= optimizeIntrinsicCalls(Functions);
 
   return Changed;
 }
@@ -480,13 +297,7 @@
       continue;
 
     switch (F.getIntrinsicID()) {
-    case Intrinsic::aarch64_sve_ptest_any:
-    case Intrinsic::aarch64_sve_ptest_first:
-    case Intrinsic::aarch64_sve_ptest_last:
     case Intrinsic::aarch64_sve_ptrue:
-    case Intrinsic::aarch64_sve_mul:
-    case Intrinsic::aarch64_sve_fmul:
-    case Intrinsic::aarch64_sve_tbl:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
       break;
diff --git a/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
rename from llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
--- a/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
@@ -1,8 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
 
 ; Idempotent fmuls -- should compile to just a ret.
-define @idempotent_fmul_f16( %pg, %a) {
+define @idempotent_fmul_f16( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_fmul_f16(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -11,7 +12,7 @@
   ret %2
 }
 
-define @idempotent_fmul_f32( %pg, %a) {
+define @idempotent_fmul_f32( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_fmul_f32(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -20,7 +21,7 @@
   ret %2
 }
 
-define @idempotent_fmul_f64( %pg, %a) {
+define @idempotent_fmul_f64( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_fmul_f64(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -29,7 +30,7 @@
   ret %2
 }
 
-define @idempotent_fmul_different_argument_order( %pg, %a) {
+define @idempotent_fmul_different_argument_order( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_fmul_different_argument_order(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -39,7 +40,7 @@
   ret %2
 }
 
-define @idempotent_fmul_with_predicated_dup( %pg, %a) {
+define @idempotent_fmul_with_predicated_dup( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_fmul_with_predicated_dup(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -48,7 +49,7 @@
   ret %2
 }
 
-define @idempotent_fmul_two_dups( %pg, %a) {
+define @idempotent_fmul_two_dups( %pg, %a) #0 {
 ; Edge case -- make sure that the case where we're fmultiplying two dups
 ; together is sane.
 ; CHECK-LABEL: @idempotent_fmul_two_dups(
@@ -62,7 +63,7 @@
 }
 
 ; Non-idempotent fmuls -- we don't expect these to be optimised out.
-define @non_idempotent_fmul_f16( %pg, %a) {
+define @non_idempotent_fmul_f16( %pg, %a) #0 {
 ; CHECK-LABEL: @non_idempotent_fmul_f16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH4000)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call @llvm.aarch64.sve.fmul.nxv8f16( [[PG:%.*]], [[A:%.*]], [[TMP1]])
@@ -73,7 +74,7 @@
   ret %2
 }
 
-define @non_idempotent_fmul_f32( %pg, %a) {
+define @non_idempotent_fmul_f32( %pg, %a) #0 {
 ; CHECK-LABEL: @non_idempotent_fmul_f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.x.nxv4f32(float 2.000000e+00)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call @llvm.aarch64.sve.fmul.nxv4f32( [[PG:%.*]], [[A:%.*]], [[TMP1]])
@@ -84,7 +85,7 @@
   ret %2
 }
 
-define @non_idempotent_fmul_f64( %pg, %a) {
+define @non_idempotent_fmul_f64( %pg, %a) #0 {
 ; CHECK-LABEL: @non_idempotent_fmul_f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.x.nxv2f64(double 2.000000e+00)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call @llvm.aarch64.sve.fmul.nxv2f64( [[PG:%.*]], [[A:%.*]], [[TMP1]])
@@ -95,7 +96,7 @@
   ret %2
 }
 
-define @non_idempotent_fmul_with_predicated_dup( %pg1, %pg2, %a) {
+define @non_idempotent_fmul_with_predicated_dup( %pg1, %pg2, %a) #0 {
 ; Different predicates
 ; CHECK-LABEL: @non_idempotent_fmul_with_predicated_dup(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.nxv2f64( undef, [[PG1:%.*]], double 1.000000e+00)
@@ -117,3 +118,5 @@
 declare @llvm.aarch64.sve.fmul.nxv8f16(, , )
 declare @llvm.aarch64.sve.fmul.nxv4f32(, , )
 declare @llvm.aarch64.sve.fmul.nxv2f64(, , )
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
rename from llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
--- a/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
@@ -1,8 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
 
 ; Idempotent muls -- should compile to just a ret.
-define @idempotent_mul_i16( %pg, %a) {
+define @idempotent_mul_i16( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_mul_i16(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -11,7 +12,7 @@
   ret %2
 }
 
-define @idempotent_mul_i32( %pg, %a) {
+define @idempotent_mul_i32( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_mul_i32(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -20,7 +21,7 @@
   ret %2
 }
 
-define @idempotent_mul_i64( %pg, %a) {
+define @idempotent_mul_i64( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_mul_i64(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -29,7 +30,7 @@
   ret %2
 }
 
-define @idempotent_mul_different_argument_order( %pg, %a) {
+define @idempotent_mul_different_argument_order( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_mul_different_argument_order(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -39,7 +40,7 @@
   ret %2
 }
 
-define @idempotent_mul_with_predicated_dup( %pg, %a) {
+define @idempotent_mul_with_predicated_dup( %pg, %a) #0 {
 ; CHECK-LABEL: @idempotent_mul_with_predicated_dup(
 ; CHECK-NEXT:    ret [[A:%.*]]
 ;
@@ -48,7 +49,7 @@
   ret %2
 }
 
-define @idempotent_mul_two_dups( %pg, %a) {
+define @idempotent_mul_two_dups( %pg, %a) #0 {
 ; Edge case -- make sure that the case where we're multiplying two dups
 ; together is sane.
 ; CHECK-LABEL: @idempotent_mul_two_dups(
@@ -62,7 +63,7 @@
 }
 
 ; Non-idempotent muls -- we don't expect these to be optimised out.
-define @non_idempotent_mul_i16( %pg, %a) {
+define @non_idempotent_mul_i16( %pg, %a) #0 {
 ; CHECK-LABEL: @non_idempotent_mul_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call @llvm.aarch64.sve.mul.nxv8i16( [[PG:%.*]], [[A:%.*]], [[TMP1]])
@@ -73,7 +74,7 @@
   ret %2
 }
 
-define @non_idempotent_mul_i32( %pg, %a) {
+define @non_idempotent_mul_i32( %pg, %a) #0 {
 ; CHECK-LABEL: @non_idempotent_mul_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call @llvm.aarch64.sve.mul.nxv4i32( [[PG:%.*]], [[A:%.*]], [[TMP1]])
@@ -84,7 +85,7 @@
   ret %2
 }
 
-define @non_idempotent_mul_i64( %pg, %a) {
+define @non_idempotent_mul_i64( %pg, %a) #0 {
 ; CHECK-LABEL: @non_idempotent_mul_i64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call @llvm.aarch64.sve.mul.nxv2i64( [[PG:%.*]], [[A:%.*]], [[TMP1]])
@@ -95,7 +96,7 @@
   ret %2
 }
 
-define @non_idempotent_mul_with_predicated_dup( %pg1, %pg2, %a) {
+define @non_idempotent_mul_with_predicated_dup( %pg1, %pg2, %a) #0 {
 ; Different predicates
 ; CHECK-LABEL: @non_idempotent_mul_with_predicated_dup(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call @llvm.aarch64.sve.dup.nxv2i64( undef, [[PG1:%.*]], i64 1)
@@ -117,3 +118,5 @@
 declare @llvm.aarch64.sve.mul.nxv8i16(, , )
 declare @llvm.aarch64.sve.mul.nxv4i32(, , )
 declare @llvm.aarch64.sve.mul.nxv2i64(, , )
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
rename from llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
--- a/llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
@@ -1,9 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
 
 ; op2 = tbl(op1 dup_x(idx)) -> op2 = vector_splat(extractelement(op1, idx))
 
-define @dup_ext_i8( %data) {
+define @dup_ext_i8( %data) #0 {
 ; CHECK-LABEL: @dup_ext_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement [[DATA:%.*]], i8 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP1]], i32 0
@@ -15,7 +16,7 @@
   ret %out
 }
 
-define @dup_ext_i16( %data) {
+define @dup_ext_i16( %data) #0 {
 ; CHECK-LABEL: @dup_ext_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement [[DATA:%.*]], i16 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[TMP1]], i32 0
@@ -27,7 +28,7 @@
   ret %out
 }
 
-define @dup_ext_i32( %data) {
+define @dup_ext_i32( %data) #0 {
 ; CHECK-LABEL: @dup_ext_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement [[DATA:%.*]], i32 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i32 0
@@ -39,7 +40,7 @@
   ret %out
 }
 
-define @dup_ext_i64( %data) {
+define @dup_ext_i64( %data) #0 {
 ; CHECK-LABEL: @dup_ext_i64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement [[DATA:%.*]], i64 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i32 0
@@ -51,7 +52,7 @@
   ret %out
 }
 
-define @dup_ext_f16( %data) {
+define @dup_ext_f16( %data) #0 {
 ; CHECK-LABEL: @dup_ext_f16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement [[DATA:%.*]], i16 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[TMP1]], i32 0
@@ -63,7 +64,7 @@
   ret %out
 }
 
-define @dup_ext_f32( %data) {
+define @dup_ext_f32( %data) #0 {
 ; CHECK-LABEL: @dup_ext_f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement [[DATA:%.*]], i32 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP1]], i32 0
@@ -75,7 +76,7 @@
   ret %out
 }
 
-define @dup_ext_f64( %data) {
+define @dup_ext_f64( %data) #0 {
 ; CHECK-LABEL: @dup_ext_f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement [[DATA:%.*]], i64 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[TMP1]], i32 0
@@ -98,3 +99,5 @@
 declare @llvm.aarch64.sve.tbl.nxv8f16( , )
 declare @llvm.aarch64.sve.tbl.nxv4f32( , )
 declare @llvm.aarch64.sve.tbl.nxv2f64( , )
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
rename from llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
@@ -1,11 +1,13 @@
-; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
 
-define i1 @ptest_any1( %a) {
-; OPT-LABEL: ptest_any1
-; OPT: %mask = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
-; OPT-NOT: convert
-; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1( %mask, %a)
-; OPT-NEXT: ret i1 %[[OUT]]
+target triple = "aarch64-unknown-linux-gnu"
+
+define i1 @ptest_any1( %a) #0 {
+; CHECK-LABEL: ptest_any1
+; CHECK: %mask = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+; CHECK-NOT: convert
+; CHECK-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1( %mask, %a)
+; CHECK-NEXT: ret i1 %[[OUT]]
   %mask = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
   %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %mask)
   %2 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %a)
@@ -14,12 +16,12 @@
 }
 
 ; No transform because the ptest is using differently sized operands.
-define i1 @ptest_any2( %a) {
-; OPT-LABEL: ptest_any2
-; OPT: %mask = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; OPT-NEXT: %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %mask)
-; OPT-NEXT: %2 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %a)
-; OPT-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1( %1, %2)
+define i1 @ptest_any2( %a) #0 {
+; CHECK-LABEL: ptest_any2
+; CHECK: %mask = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %mask)
+; CHECK-NEXT: %2 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %a)
+; CHECK-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1( %1, %2)
   %mask = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %mask)
   %2 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %a)
@@ -27,12 +29,12 @@
   ret i1 %out
 }
 
-define i1 @ptest_first( %a) {
-; OPT-LABEL: ptest_first
-; OPT: %mask = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
-; OPT-NOT: convert
-; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1( %mask, %a)
-; OPT-NEXT: ret i1 %[[OUT]]
+define i1 @ptest_first( %a) #0 {
+; CHECK-LABEL: ptest_first
+; CHECK: %mask = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+; CHECK-NOT: convert
+; CHECK-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1( %mask, %a)
+; CHECK-NEXT: ret i1 %[[OUT]]
   %mask = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
   %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %mask)
   %2 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %a)
@@ -40,22 +42,22 @@
   ret i1 %out
 }
 
-define i1 @ptest_first_same_ops( %a) {
-; OPT-LABEL: ptest_first_same_ops
-; OPT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv2i1( %a, %a)
-; OPT-NOT: convert
-; OPT-NEXT: ret i1 %[[OUT]]
+define i1 @ptest_first_same_ops( %a) #0 {
+; CHECK-LABEL: ptest_first_same_ops
+; CHECK: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv2i1( %a, %a)
+; CHECK-NOT: convert
+; CHECK-NEXT: ret i1 %[[OUT]]
   %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %a)
   %2 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1( %1, %1)
   ret i1 %2
 }
 
-define i1 @ptest_last( %a) {
-; OPT-LABEL: ptest_last
-; OPT: %mask = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
-; OPT-NOT: convert
-; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1( %mask, %a)
-; OPT-NEXT: ret i1 %[[OUT]]
+define i1 @ptest_last( %a) #0 {
+; CHECK-LABEL: ptest_last
+; CHECK: %mask = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+; CHECK-NOT: convert
+; CHECK-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1( %mask, %a)
+; CHECK-NEXT: ret i1 %[[OUT]]
   %mask = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
   %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %mask)
   %2 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %a)
@@ -75,3 +77,5 @@
 declare @llvm.aarch64.sve.convert.to.svbool.nxv8i1()
 declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1()
 declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1()
+
+attributes #0 = { "target-features"="+sve" }
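
For reference, a minimal standalone input that exercises the new instCombineVectorMul path is sketched below; this is illustrative only and not part of the patch, and the function name and file name are invented. Assuming an opt binary built with the AArch64 backend, running `opt -S -instcombine` over it should fold the multiply-by-splat-of-one into a plain `ret` of %a, mirroring the idempotent-mul tests above.

; sample-sve-mul-fold.ll (hypothetical file name)
target triple = "aarch64-unknown-linux-gnu"

define <vscale x 4 x i32> @sample_idempotent_mul(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
  ; Multiplying by a dup_x splat of 1 is the pattern the new combine recognises.
  %one = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
  %mul = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %one)
  ret <vscale x 4 x i32> %mul
}

declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)

attributes #0 = { "target-features"="+sve" }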