diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1103,8 +1103,8 @@ /// is using a compare with the specified predicate as condition. When vector /// types are passed, \p VecPred must be used for all lanes. InstructionCost - getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = nullptr, - CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE, + getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, const Instruction *I = nullptr) const; diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -348,6 +348,9 @@ SinkAndHoistLICMFlags *LICMFlags = nullptr, OptimizationRemarkEmitter *ORE = nullptr); +/// Returns the comparison predicate used when expanding a min/max reduction. +CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK); + /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -889,32 +889,28 @@ return true; } -Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, - Value *Right) { - CmpInst::Predicate Pred; +CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) { switch (RK) { default: llvm_unreachable("Unknown min/max recurrence kind"); case RecurKind::UMin: - Pred = CmpInst::ICMP_ULT; - break; + return CmpInst::ICMP_ULT; case RecurKind::UMax: - Pred = CmpInst::ICMP_UGT; - break; + return CmpInst::ICMP_UGT; case RecurKind::SMin: - Pred = CmpInst::ICMP_SLT; - break; + return CmpInst::ICMP_SLT; case RecurKind::SMax: - Pred = CmpInst::ICMP_SGT; - break; + return CmpInst::ICMP_SGT; case RecurKind::FMin: - Pred = CmpInst::FCMP_OLT; - break; + return CmpInst::FCMP_OLT; case RecurKind::FMax: - Pred = CmpInst::FCMP_OGT; - break; + return CmpInst::FCMP_OGT; } +} +Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, + Value *Right) { + CmpInst::Predicate Pred = getMinMaxReductionPredicate(RK); Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp"); Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); return Select; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8579,28 +8579,32 @@ } case RecurKind::FMax: case RecurKind::FMin: { + auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, /*unsigned=*/false, CostKind); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); + ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, + SclCondTy, RdxPred, CostKind) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + SclCondTy, RdxPred, CostKind); break; } case RecurKind::SMax: case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: { + auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); bool IsUnsigned = RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned, CostKind); - ScalarCost = - TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + - TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); + CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); + ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + SclCondTy, RdxPred, CostKind) + + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, + SclCondTy, RdxPred, CostKind); break; } default: diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -82,7 +82,7 @@ ExtractElementInst *Ext1, unsigned PreferredExtractIndex) const; bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - unsigned Opcode, + const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex); void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, @@ -299,12 +299,13 @@ /// \p ConvertToShuffle to that extract instruction. bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - unsigned Opcode, + const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex) { assert(isa(Ext0->getOperand(1)) && isa(Ext1->getOperand(1)) && "Expected constant extract indexes"); + unsigned Opcode = I.getOpcode(); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast(Ext0->getOperand(0)->getType()); InstructionCost ScalarOpCost, VectorOpCost; @@ -317,10 +318,11 @@ } else { assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && "Expected a compare"); - ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy)); - VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy, - CmpInst::makeCmpResultType(VecTy)); + CmpInst::Predicate Pred = cast(I).getPredicate(); + ScalarOpCost = TTI.getCmpSelInstrCost( + Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); + VectorOpCost = TTI.getCmpSelInstrCost( + Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); } // Get cost estimates for the extract elements. These costs will factor into @@ -495,8 +497,7 @@ m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex))); ExtractElementInst *ExtractToChange; - if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange, - InsertIndex)) + if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) return false; if (ExtractToChange) { @@ -640,8 +641,11 @@ unsigned Opcode = I.getOpcode(); InstructionCost ScalarOpCost, VectorOpCost; if (IsCmp) { - ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy); - VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy); + CmpInst::Predicate Pred = cast(I).getPredicate(); + ScalarOpCost = TTI.getCmpSelInstrCost( + Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); + VectorOpCost = TTI.getCmpSelInstrCost( + Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); } else { ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); @@ -741,7 +745,10 @@ InstructionCost OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0); OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1); - OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2; + OldCost += + TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), + CmpInst::makeCmpResultType(I0->getType()), Pred) * + 2; OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); // The proposed vector pattern is: @@ -750,7 +757,8 @@ int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; auto *CmpTy = cast(CmpInst::makeCmpResultType(X->getType())); - InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); + InstructionCost NewCost = TTI.getCmpSelInstrCost( + CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred); SmallVector ShufMask(VecTy->getNumElements(), UndefMaskElem); ShufMask[CheapIndex] = ExpensiveIndex; NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll @@ -1,15 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX define i1 @fcmp_and_v2f64(<2 x double> %a) { -; CHECK-LABEL: @fcmp_and_v2f64( -; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 -; CHECK-NEXT: [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1 -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01 -; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00 -; CHECK-NEXT: [[R:%.*]] = and i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[R]] +; SSE-LABEL: @fcmp_and_v2f64( +; SSE-NEXT: [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 +; SSE-NEXT: [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1 +; SSE-NEXT: [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01 +; SSE-NEXT: [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00 +; SSE-NEXT: [[R:%.*]] = and i1 [[CMP1]], [[CMP2]] +; SSE-NEXT: ret i1 [[R]] +; +; AVX-LABEL: @fcmp_and_v2f64( +; AVX-NEXT: [[TMP1:%.*]] = fcmp olt <2 x double> [[A:%.*]], +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]] +; AVX-NEXT: [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 +; AVX-NEXT: ret i1 [[R]] ; %e1 = extractelement <2 x double> %a, i32 0 %e2 = extractelement <2 x double> %a, i32 1 @@ -20,13 +27,20 @@ } define i1 @fcmp_or_v4f64(<4 x double> %a) { -; CHECK-LABEL: @fcmp_or_v4f64( -; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 -; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2 -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01 -; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00 -; CHECK-NEXT: [[R:%.*]] = or i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[R]] +; SSE-LABEL: @fcmp_or_v4f64( +; SSE-NEXT: [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 +; SSE-NEXT: [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2 +; SSE-NEXT: [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01 +; SSE-NEXT: [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00 +; SSE-NEXT: [[R:%.*]] = or i1 [[CMP1]], [[CMP2]] +; SSE-NEXT: ret i1 [[R]] +; +; AVX-LABEL: @fcmp_or_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = fcmp olt <4 x double> [[A:%.*]], +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[SHIFT]] +; AVX-NEXT: [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 +; AVX-NEXT: ret i1 [[R]] ; %e1 = extractelement <4 x double> %a, i32 0 %e2 = extractelement <4 x double> %a, i64 2 @@ -38,11 +52,10 @@ define i1 @icmp_xor_v4i32(<4 x i32> %a) { ; CHECK-LABEL: @icmp_xor_v4i32( -; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 3 -; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[E1]], 42 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[E2]], -8 -; CHECK-NEXT: [[R:%.*]] = xor i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]] +; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 ; CHECK-NEXT: ret i1 [[R]] ; %e1 = extractelement <4 x i32> %a, i32 3 @@ -56,13 +69,20 @@ ; add is not canonical (should be xor), but that is ok. define i1 @icmp_add_v8i32(<8 x i32> %a) { -; CHECK-LABEL: @icmp_add_v8i32( -; CHECK-NEXT: [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7 -; CHECK-NEXT: [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[E1]], 42 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[E2]], -8 -; CHECK-NEXT: [[R:%.*]] = add i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[R]] +; SSE-LABEL: @icmp_add_v8i32( +; SSE-NEXT: [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7 +; SSE-NEXT: [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[CMP1:%.*]] = icmp eq i32 [[E1]], 42 +; SSE-NEXT: [[CMP2:%.*]] = icmp eq i32 [[E2]], -8 +; SSE-NEXT: [[R:%.*]] = add i1 [[CMP1]], [[CMP2]] +; SSE-NEXT: ret i1 [[R]] +; +; AVX-LABEL: @icmp_add_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = icmp eq <8 x i32> [[A:%.*]], +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = add <8 x i1> [[TMP1]], [[SHIFT]] +; AVX-NEXT: [[R:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2 +; AVX-NEXT: ret i1 [[R]] ; %e1 = extractelement <8 x i32> %a, i32 7 %e2 = extractelement <8 x i32> %a, i32 2 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll @@ -1,30 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) { ; CHECK-LABEL: @cmp_v4i32( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[T:%.*]] = bitcast <4 x float> [[ARG:%.*]] to <4 x i32> -; CHECK-NEXT: [[T2:%.*]] = extractelement <4 x i32> [[T]], i32 0 ; CHECK-NEXT: [[T3:%.*]] = bitcast <4 x float> [[ARG1:%.*]] to <4 x i32> -; CHECK-NEXT: [[T4:%.*]] = extractelement <4 x i32> [[T3]], i32 0 -; CHECK-NEXT: [[T5:%.*]] = icmp eq i32 [[T2]], [[T4]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[T5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 ; CHECK-NEXT: br i1 [[T5]], label [[BB6:%.*]], label [[BB18:%.*]] ; CHECK: bb6: -; CHECK-NEXT: [[T7:%.*]] = extractelement <4 x i32> [[T]], i32 1 -; CHECK-NEXT: [[T8:%.*]] = extractelement <4 x i32> [[T3]], i32 1 -; CHECK-NEXT: [[T9:%.*]] = icmp eq i32 [[T7]], [[T8]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[T9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 ; CHECK-NEXT: br i1 [[T9]], label [[BB10:%.*]], label [[BB18]] ; CHECK: bb10: -; CHECK-NEXT: [[T11:%.*]] = extractelement <4 x i32> [[T]], i32 2 -; CHECK-NEXT: [[T12:%.*]] = extractelement <4 x i32> [[T3]], i32 2 -; CHECK-NEXT: [[T13:%.*]] = icmp eq i32 [[T11]], [[T12]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[T13:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 ; CHECK-NEXT: br i1 [[T13]], label [[BB14:%.*]], label [[BB18]] ; CHECK: bb14: -; CHECK-NEXT: [[T15:%.*]] = extractelement <4 x i32> [[T]], i32 3 -; CHECK-NEXT: [[T16:%.*]] = extractelement <4 x i32> [[T3]], i32 3 -; CHECK-NEXT: [[T17:%.*]] = icmp eq i32 [[T15]], [[T16]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[T]], [[T3]] +; CHECK-NEXT: [[T17:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; CHECK-NEXT: br label [[BB18]] ; CHECK: bb18: ; CHECK-NEXT: [[T19:%.*]] = phi i1 [ false, [[BB10]] ], [ false, [[BB6]] ], [ false, [[BB:%.*]] ], [ [[T17]], [[BB14]] ] @@ -62,19 +58,32 @@ } define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) { -; CHECK-LABEL: @cmp_v2f64( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]] -; CHECK-NEXT: br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]] -; CHECK: t: -; CHECK-NEXT: [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1 -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]] -; CHECK-NEXT: [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99 -; CHECK-NEXT: ret i32 [[E]] -; CHECK: f: -; CHECK-NEXT: ret i32 0 +; SSE-LABEL: @cmp_v2f64( +; SSE-NEXT: entry: +; SSE-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 +; SSE-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1 +; SSE-NEXT: [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]] +; SSE-NEXT: br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]] +; SSE: t: +; SSE-NEXT: [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1 +; SSE-NEXT: [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]] +; SSE-NEXT: [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99 +; SSE-NEXT: ret i32 [[E]] +; SSE: f: +; SSE-NEXT: ret i32 0 +; +; AVX-LABEL: @cmp_v2f64( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]] +; AVX-NEXT: [[CMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; AVX-NEXT: br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]] +; AVX: t: +; AVX-NEXT: [[TMP1:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]] +; AVX-NEXT: [[CMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; AVX-NEXT: [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99 +; AVX-NEXT: ret i32 [[E]] +; AVX: f: +; AVX-NEXT: ret i32 0 ; entry: %x1 = extractelement <2 x double> %x, i32 1 @@ -93,11 +102,17 @@ } define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) { -; CHECK-LABEL: @cmp01_v2f64( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]] -; CHECK-NEXT: ret i1 [[CMP]] +; SSE-LABEL: @cmp01_v2f64( +; SSE-NEXT: [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0 +; SSE-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1 +; SSE-NEXT: [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]] +; SSE-NEXT: ret i1 [[CMP]] +; +; AVX-LABEL: @cmp01_v2f64( +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> poison, <2 x i32> +; AVX-NEXT: [[TMP1:%.*]] = fcmp oge <2 x double> [[X:%.*]], [[SHIFT]] +; AVX-NEXT: [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; AVX-NEXT: ret i1 [[CMP]] ; %x0 = extractelement <2 x double> %x, i32 0 %y1 = extractelement <2 x double> %y, i32 1 @@ -106,11 +121,17 @@ } define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) { -; CHECK-LABEL: @cmp10_v2f64( -; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 -; CHECK-NEXT: [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0 -; CHECK-NEXT: [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]] -; CHECK-NEXT: ret i1 [[CMP]] +; SSE-LABEL: @cmp10_v2f64( +; SSE-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 +; SSE-NEXT: [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0 +; SSE-NEXT: [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]] +; SSE-NEXT: ret i1 [[CMP]] +; +; AVX-LABEL: @cmp10_v2f64( +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> poison, <2 x i32> +; AVX-NEXT: [[TMP1:%.*]] = fcmp ule <2 x double> [[SHIFT]], [[Y:%.*]] +; AVX-NEXT: [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i64 0 +; AVX-NEXT: ret i1 [[CMP]] ; %x1 = extractelement <2 x double> %x, i32 1 %y0 = extractelement <2 x double> %y, i32 0 @@ -120,9 +141,9 @@ define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @cmp12_v4i32( -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X1]], [[Y2]] +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[SHIFT]] +; CHECK-NEXT: [[CMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 ; CHECK-NEXT: ret i1 [[CMP]] ; %x1 = extractelement <4 x i32> %x, i32 1 @@ -132,12 +153,19 @@ } define <4 x i1> @ins_fcmp_ext_ext(<4 x float> %a, <4 x i1> %b) { -; CHECK-LABEL: @ins_fcmp_ext_ext( -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; CHECK-NEXT: [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2 -; CHECK-NEXT: ret <4 x i1> [[R]] +; SSE-LABEL: @ins_fcmp_ext_ext( +; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 +; SSE-NEXT: [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]] +; SSE-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2 +; SSE-NEXT: ret <4 x i1> [[R]] +; +; AVX-LABEL: @ins_fcmp_ext_ext( +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP1:%.*]] = fcmp ugt <4 x float> [[A]], [[SHIFT]] +; AVX-NEXT: [[A21:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; AVX-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2 +; AVX-NEXT: ret <4 x i1> [[R]] ; %a1 = extractelement <4 x float> %a, i32 1 %a2 = extractelement <4 x float> %a, i32 2 diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll @@ -132,8 +132,7 @@ ; CHECK-NEXT: call void @usef(<4 x float> [[I0]]) ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> poison, float [[Y:%.*]], i32 2 ; CHECK-NEXT: call void @usef(<4 x float> [[I1]]) -; CHECK-NEXT: [[R_SCALAR:%.*]] = fcmp oeq float [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> poison, i1 [[R_SCALAR]], i64 2 +; CHECK-NEXT: [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]] ; CHECK-NEXT: ret <4 x i1> [[R]] ; %i0 = insertelement <4 x float> poison, float %x, i32 2 diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll @@ -132,8 +132,7 @@ ; CHECK-NEXT: call void @usef(<4 x float> [[I0]]) ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 2 ; CHECK-NEXT: call void @usef(<4 x float> [[I1]]) -; CHECK-NEXT: [[R_SCALAR:%.*]] = fcmp oeq float [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 2 +; CHECK-NEXT: [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]] ; CHECK-NEXT: ret <4 x i1> [[R]] ; %i0 = insertelement <4 x float> undef, float %x, i32 2