diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -417,6 +417,7 @@ void initializeUnpackMachineBundlesPass(PassRegistry&); void initializeUnreachableBlockElimLegacyPassPass(PassRegistry&); void initializeUnreachableMachineBlockElimPass(PassRegistry&); +void initializeVectorCombineLegacyPassPass(PassRegistry&); void initializeVerifierLegacyPassPass(PassRegistry&); void initializeVirtRegMapPass(PassRegistry&); void initializeVirtRegRewriterPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -213,6 +213,7 @@ (void) llvm::createLoopVectorizePass(); (void) llvm::createSLPVectorizerPass(); (void) llvm::createLoadStoreVectorizerPass(); + (void) llvm::createVectorCombinePass(); (void) llvm::createPartiallyInlineLibCallsPass(); (void) llvm::createScalarizerPass(); (void) llvm::createSeparateConstOffsetFromGEPPass(); diff --git a/llvm/include/llvm/Transforms/Vectorize.h b/llvm/include/llvm/Transforms/Vectorize.h --- a/llvm/include/llvm/Transforms/Vectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize.h @@ -138,6 +138,12 @@ // Pass *createLoadStoreVectorizerPass(); +//===----------------------------------------------------------------------===// +// +// Optimize partial vector operations using target cost models. +// +Pass *createVectorCombinePass(); + } // End llvm namespace #endif diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorCombine.h b/llvm/include/llvm/Transforms/Vectorize/VectorCombine.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorCombine.h @@ -0,0 +1,30 @@ +//===-------- VectorCombine.h - Optimize partial vector operations --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass optimizes scalar/vector interactions using target cost models. The +// transforms implemented here may not fit in traditional loop-based or SLP +// vectorization passes. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTOR_VECTORCOMBINE_H +#define LLVM_TRANSFORMS_VECTOR_VECTORCOMBINE_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// Optimize scalar/vector interactions in IR using target cost models. +struct VectorCombinePass : public PassInfoMixin<VectorCombinePass> { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &); +}; + +} +#endif // LLVM_TRANSFORMS_VECTOR_VECTORCOMBINE_H + diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -185,6 +185,7 @@ #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" +#include "llvm/Transforms/Vectorize/VectorCombine.h" using namespace llvm; @@ -956,6 +957,7 @@ OptimizePM.addPass(LoopLoadEliminationPass()); // Cleanup after the loop optimization passes. + OptimizePM.addPass(VectorCombinePass()); OptimizePM.addPass(InstCombinePass()); // Now that we've formed fast to execute loop structures, we do further @@ -974,8 +976,10 @@ sinkCommonInsts(true))); // Optimize parallel scalar instruction chains into SIMD instructions. 
- if (PTO.SLPVectorization) + if (PTO.SLPVectorization) { OptimizePM.addPass(SLPVectorizerPass()); + OptimizePM.addPass(VectorCombinePass()); + } OptimizePM.addPass(InstCombinePass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -240,6 +240,7 @@ FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) FUNCTION_PASS("unroll-and-jam", LoopUnrollAndJamPass()) +FUNCTION_PASS("vector-combine", VectorCombinePass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -47,6 +47,7 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" +#include "llvm/Transforms/Vectorize/VectorCombine.h" using namespace llvm; @@ -738,6 +739,7 @@ // on -O1 and no #pragma is found). Would be good to have these two passes // as function calls, so that we can only pass them when the vectorizer // changed the code. + MPM.add(createVectorCombinePass()); addInstructionCombiningPass(MPM); if (OptLevel > 1 && ExtraVectorizerPasses) { // At higher optimization levels, try to clean up any runtime overlap and @@ -764,6 +766,7 @@ if (SLPVectorize) { MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + MPM.add(createVectorCombinePass()); if (OptLevel > 1 && ExtraVectorizerPasses) { MPM.add(createEarlyCSEPass()); } @@ -1001,6 +1004,7 @@ // Now that we've optimized loops (in particular loop induction variables), // we may have exposed more scalar opportunities. Run parts of the scalar // optimizer again at this point. 
+ PM.add(createVectorCombinePass()); addInstructionCombiningPass(PM); // Initial cleanup PM.add(createCFGSimplificationPass()); // if-convert PM.add(createSCCPPass()); // Propagate exposed constants @@ -1008,8 +1012,10 @@ PM.add(createBitTrackingDCEPass()); // More scalar chains could be vectorized due to more alias information - if (SLPVectorize) + if (SLPVectorize) { PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + PM.add(createVectorCombinePass()); // Clean up partial vectorization. + } // After vectorization, assume intrinsics may tell us more about pointer // alignments. diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -4,6 +4,7 @@ LoopVectorize.cpp SLPVectorizer.cpp Vectorize.cpp + VectorCombine.cpp VPlan.cpp VPlanHCFGBuilder.cpp VPlanPredicator.cpp diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -0,0 +1,160 @@ +//===------- VectorCombine.cpp - Optimize partial vector operations -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass optimizes scalar/vector interactions using target cost models. The +// transforms implemented here may not fit in traditional loop-based or SLP +// vectorization passes. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/DebugCounter.h" +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "vector-combine" +STATISTIC(NumVecCmp, "Number of vector compares formed"); +DEBUG_COUNTER(VecCombineCounter, "vector-combine-transform", + "Controls transformations in vector-combine pass"); + +static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) { + // Match a cmp with extracted vector operands. + CmpInst::Predicate Pred; + Instruction *Ext0, *Ext1; + if (!match(&I, m_Cmp(Pred, m_Instruction(Ext0), m_Instruction(Ext1)))) + return false; + + Value *V0, *V1; + ConstantInt *C; + if (!match(Ext0, m_ExtractElement(m_Value(V0), m_ConstantInt(C))) || + !match(Ext1, m_ExtractElement(m_Value(V1), m_Specific(C))) || + V0->getType() != V1->getType()) + return false; + + Type *ScalarTy = Ext0->getType(); + Type *VecTy = V0->getType(); + bool IsFP = ScalarTy->isFloatingPointTy(); + unsigned CmpOpcode = IsFP ? Instruction::FCmp : Instruction::ICmp; + + // Check if the existing scalar code or the vector alternative is cheaper. + // Extra uses of the extracts mean that we include those costs in the + // vector total because those instructions will not be eliminated. + // ((2 * extract) + scalar cmp) < (vector cmp + extract) ? 
+ int ExtractCost = TTI.getVectorInstrCost(Instruction::ExtractElement, + VecTy, C->getZExtValue()); + int ScalarCmpCost = TTI.getOperationCost(CmpOpcode, ScalarTy); + int VecCmpCost = TTI.getOperationCost(CmpOpcode, VecTy); + + int ScalarCost = 2 * ExtractCost + ScalarCmpCost; + int VecCost = VecCmpCost + ExtractCost + + !Ext0->hasOneUse() * ExtractCost + + !Ext1->hasOneUse() * ExtractCost; + if (ScalarCost < VecCost) + return false; + + // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C + ++NumVecCmp; + IRBuilder<> Builder(&I); + Value *VecCmp = IsFP ? Builder.CreateFCmp(Pred, V0, V1) + : Builder.CreateICmp(Pred, V0, V1); + Value *Ext = Builder.CreateExtractElement(VecCmp, C); + I.replaceAllUsesWith(Ext); + return true; +} + +/// This is the entry point for all transforms. Pass manager differences are +/// handled in the callers of this function. +static bool runImpl(Function &F, const TargetTransformInfo &TTI, + const DominatorTree &DT) { + bool MadeChange = false; + for (BasicBlock &BB : F) { + // Ignore unreachable basic blocks. + if (!DT.isReachableFromEntry(&BB)) + continue; + // Do not delete instructions under here and invalidate the iterator. + // Walk the block backwards for efficiency. We're matching a chain of + // use->defs, so we're more likely to succeed by starting from the bottom. + // TODO: It could be more efficient to remove dead instructions + // iteratively in this loop rather than waiting until the end. + for (Instruction &I : make_range(BB.rbegin(), BB.rend())) { + MadeChange |= foldExtractCmp(I, TTI); + // TODO: More transforms go here. + } + } + + // We're done with transforms, so remove dead instructions. + if (MadeChange) + for (BasicBlock &BB : F) + SimplifyInstructionsInBlock(&BB); + + return MadeChange; +} + +// Pass manager boilerplate below here. 
+ +namespace { +class VectorCombineLegacyPass : public FunctionPass { +public: + static char ID; + VectorCombineLegacyPass() : FunctionPass(ID) { + initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return runImpl(F, TTI, DT); + } +}; +} // namespace + +char VectorCombineLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine", + "Optimize scalar/vector ops", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine", + "Optimize scalar/vector ops", false, false) +Pass *llvm::createVectorCombinePass() { + return new VectorCombineLegacyPass(); +} + +PreservedAnalyses VectorCombinePass::run(Function &F, + FunctionAnalysisManager &FAM) { + TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + if (!runImpl(F, TTI, DT)) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + PA.preserve<GlobalsAA>(); + return PA; +} diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp --- a/llvm/lib/Transforms/Vectorize/Vectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp @@ -21,12 +21,12 @@ using namespace llvm; -/// initializeVectorizationPasses - Initialize all passes linked into the -/// Vectorization library. +/// Initialize all passes linked into the Vectorization library. 
void llvm::initializeVectorization(PassRegistry &Registry) { initializeLoopVectorizePass(Registry); initializeSLPVectorizerPass(Registry); initializeLoadStoreVectorizerLegacyPassPass(Registry); + initializeVectorCombineLegacyPassPass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -253,11 +253,15 @@ ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis +; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O2-NEXT: Running pass: VectorCombinePass +; CHECK-O3-NEXT: Running pass: VectorCombinePass +; CHECK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -223,11 +223,15 @@ ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis +; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-Os-NEXT: Running pass: 
SLPVectorizerPass +; CHECK-POSTLINK-O2-NEXT: Running pass: VectorCombinePass +; CHECK-POSTLINK-O3-NEXT: Running pass: VectorCombinePass +; CHECK-POSTLINK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -191,11 +191,15 @@ ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis +; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O2-NEXT: Running pass: VectorCombinePass +; CHECK-O3-NEXT: Running pass: VectorCombinePass +; CHECK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -202,11 +202,15 @@ ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis +; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O2-NEXT: 
Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O2-NEXT: Running pass: VectorCombinePass +; CHECK-O3-NEXT: Running pass: VectorCombinePass +; CHECK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -237,6 +237,7 @@ ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination +; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Lazy Branch Probability Analysis @@ -254,6 +255,9 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -242,6 +242,7 @@ ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination +; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Lazy Branch Probability Analysis @@ -259,6 +260,9 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: 
Optimize scalar/vector ops +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -224,6 +224,7 @@ ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination +; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Lazy Branch Probability Analysis @@ -241,6 +242,9 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s + +define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) { +; CHECK-LABEL: @cmp_v4i32( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[T:%.*]] = bitcast <4 x float> [[ARG:%.*]] to <4 x i32> +; CHECK-NEXT: [[T3:%.*]] = bitcast <4 x float> [[ARG1:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[BB6:%.*]], label 
[[BB18:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP3]], label [[BB10:%.*]], label [[BB18]] +; CHECK: bb10: +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 +; CHECK-NEXT: br i1 [[TMP5]], label [[BB14:%.*]], label [[BB18]] +; CHECK: bb14: +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3 +; CHECK-NEXT: br label [[BB18]] +; CHECK: bb18: +; CHECK-NEXT: [[T19:%.*]] = phi i1 [ false, [[BB10]] ], [ false, [[BB6]] ], [ false, [[BB:%.*]] ], [ [[TMP7]], [[BB14]] ] +; CHECK-NEXT: ret i1 [[T19]] +; +bb: + %t = bitcast <4 x float> %arg to <4 x i32> + %t2 = extractelement <4 x i32> %t, i32 0 + %t3 = bitcast <4 x float> %arg1 to <4 x i32> + %t4 = extractelement <4 x i32> %t3, i32 0 + %t5 = icmp eq i32 %t2, %t4 + br i1 %t5, label %bb6, label %bb18 + +bb6: + %t7 = extractelement <4 x i32> %t, i32 1 + %t8 = extractelement <4 x i32> %t3, i32 1 + %t9 = icmp eq i32 %t7, %t8 + br i1 %t9, label %bb10, label %bb18 + +bb10: + %t11 = extractelement <4 x i32> %t, i32 2 + %t12 = extractelement <4 x i32> %t3, i32 2 + %t13 = icmp eq i32 %t11, %t12 + br i1 %t13, label %bb14, label %bb18 + +bb14: + %t15 = extractelement <4 x i32> %t, i32 3 + %t16 = extractelement <4 x i32> %t3, i32 3 + %t17 = icmp eq i32 %t15, %t16 + br label %bb18 + +bb18: + %t19 = phi i1 [ false, %bb10 ], [ false, %bb6 ], [ false, %bb ], [ %t17, %bb14 ] + ret i1 %t19 +} + +define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) { +; CHECK-LABEL: @cmp_v2f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP1]], label [[T:%.*]], label [[F:%.*]] +; CHECK: t: +; CHECK-NEXT: 
[[TMP2:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: [[E:%.*]] = select i1 [[TMP3]], i32 42, i32 99 +; CHECK-NEXT: ret i32 [[E]] +; CHECK: f: +; CHECK-NEXT: ret i32 0 +; +entry: + %x1 = extractelement <2 x double> %x, i32 1 + %y1 = extractelement <2 x double> %y, i32 1 + %cmp1 = fcmp oeq double %x1, %y1 + br i1 %cmp1, label %t, label %f + +t: + %z1 = extractelement <2 x double> %z, i32 1 + %cmp2 = fcmp ogt double %y1, %z1 + %e = select i1 %cmp2, i32 42, i32 99 + ret i32 %e + +f: + ret i32 0 +} diff --git a/llvm/test/Transforms/VectorCombine/X86/lit.local.cfg b/llvm/test/Transforms/VectorCombine/X86/lit.local.cfg new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'X86' in config.root.targets: + config.unsupported = True