diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4544,10 +4544,11 @@ Value *RHS = Cmp->getOperand(1); CmpInst::Predicate CurrentPred = Cmp->getPredicate(); if (P0 == AltP0Swapped) { - if ((P0 == CurrentPred && - !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || - (AltP0 == CurrentPred && - areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS))) + if (CI != Cmp && S.AltOp != Cmp && + ((P0 == CurrentPred && + !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || + (AltP0 == CurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) std::swap(LHS, RHS); } else if (P0 != CurrentPred && AltP0 != CurrentPred) { std::swap(LHS, RHS); @@ -4835,6 +4836,29 @@ } } +/// Checks if the specified instruction \p I is an alternate operation for the +/// given \p MainOp and \p AltOp instructions. +static bool isAlternateInstruction(const Instruction *I, + const Instruction *MainOp, + const Instruction *AltOp) { + if (auto *CI0 = dyn_cast(MainOp)) { + auto *AltCI0 = cast(AltOp); + auto *CI = cast(I); + CmpInst::Predicate P0 = CI0->getPredicate(); + CmpInst::Predicate AltP0 = AltCI0->getPredicate(); + assert(P0 != AltP0 && "Expected different main/alternate predicates."); + CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); + CmpInst::Predicate CurrentPred = CI->getPredicate(); + if (P0 == AltP0Swapped) + return I == AltCI0 || + (I != MainOp && + !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), + CI->getOperand(0), CI->getOperand(1))); + return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; + } + return I->getOpcode() == AltOp->getOpcode(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals) { ArrayRef VL = E->Scalars; @@ -5560,28 +5584,7 @@ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - assert(P0 != AltP0 && - "Expected different main/alternate predicates."); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - if (P0 == AltP0Swapped) - return (P0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(0), CI->getOperand(1))) || - (AltP0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(1), CI->getOperand(0))); - return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; - } - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); }, Mask); CommonCost = @@ -7081,10 +7084,6 @@ V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); auto *AltCI = cast(E->getAltOp()); CmpInst::Predicate AltPred = AltCI->getPredicate(); - unsigned AltIdx = - std::distance(E->Scalars.begin(), find(E->Scalars, AltCI)); - if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx]) - AltPred = CmpInst::getSwappedPredicate(AltPred); V1 = Builder.CreateCmp(AltPred, LHS, RHS); } else { V0 = Builder.CreateCast( @@ -7110,28 +7109,7 @@ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - assert(P0 != AltP0 && - "Expected different main/alternate predicates."); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - if (P0 == AltP0Swapped) - return (P0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(0), CI->getOperand(1))) || - (AltP0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(1), CI->getOperand(0))); - return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; - } - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); }, Mask, &OpScalars, &AltScalars); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll @@ -5,14 +5,16 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = load i16, i16* undef, align 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 3 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP0]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i16 [[TMP5]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[CALL37]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i16> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i1> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]]) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i16 [[TMP8]], 0 ; CHECK-NEXT: ret i16 [[OP_EXTRA]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll @@ -46,20 +46,21 @@ ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV_I32_I_I_I1:%.*]] = fptosi float 0.000000e+00 to i32 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I1]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> zeroinitializer, <4 x float> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[RETVAL_SROA_0_0_VEC_INSERT4:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[RETVAL_SROA_0_4_VEC_INSERT7:%.*]] = insertelement <2 x float> [[RETVAL_SROA_0_0_VEC_INSERT4]], float [[TMP7]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 -; CHECK-NEXT: [[RETVAL_SROA_7_8_VEC_INSERT11:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP8]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 -; CHECK-NEXT: [[RETVAL_SROA_7_12_VEC_INSERT13:%.*]] = insertelement <2 x float> [[RETVAL_SROA_7_8_VEC_INSERT11]], float [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[CONV_I32_I_I_I1]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x float> zeroinitializer, <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[RETVAL_SROA_0_0_VEC_INSERT4:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 1 +; CHECK-NEXT: [[RETVAL_SROA_0_4_VEC_INSERT7:%.*]] = insertelement <2 x float> [[RETVAL_SROA_0_0_VEC_INSERT4]], float [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; CHECK-NEXT: [[RETVAL_SROA_7_8_VEC_INSERT11:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; CHECK-NEXT: [[RETVAL_SROA_7_12_VEC_INSERT13:%.*]] = insertelement <2 x float> [[RETVAL_SROA_7_8_VEC_INSERT11]], float [[TMP10]], i64 1 ; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } zeroinitializer, <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT7]], 0 ; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[RETVAL_SROA_7_12_VEC_INSERT13]], 1 ; CHECK-NEXT: ret { <2 x float>, <2 x float> } zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -251,28 +251,53 @@ } define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: call void @use1(i1 [[C2]]) -; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; SSE-NEXT: call void @use1(i1 [[C2]]) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> +; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) +; SSE-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[C2]], i1 false +; SSE-NEXT: [[S5:%.*]] = select i1 [[OP_EXTRA]], i1 [[D1]], i1 false +; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; SSE-NEXT: ret i1 [[S7]] +; +; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: call void @use1(i1 [[C2]]) +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -395,25 +420,47 @@ } define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_partial( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; SSE-LABEL: @logical_and_icmp_clamp_partial( +; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> +; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) +; SSE-NEXT: [[S5:%.*]] = select i1 [[TMP9]], i1 [[D1]], i1 false +; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; SSE-NEXT: ret i1 [[S7]] +; +; AVX-LABEL: @logical_and_icmp_clamp_partial( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1