diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3665,8 +3665,76 @@ bool TopToBottom) { // No need to reorder if need to shuffle reuses, still need to shuffle the // node. - if (!TE.ReuseShuffleIndices.empty()) - return None; + if (!TE.ReuseShuffleIndices.empty()) { + // Check if reuse shuffle indices can be improved by reordering. + // For this, check that reuse mask is "clustered", i.e. each scalar values + // is used once in each submask of size . + // Example: 4 scalar values. + // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. + // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because + // element 3 is used twice in the second submask. + unsigned Sz = TE.Scalars.size(); + unsigned VF = TE.getVectorFactor(); + auto &&IsClusteredReuse = [Sz, VF](const TreeEntry &TE) { + for (unsigned K = 0; K < VF; K += Sz) { + ArrayRef SubMask = + makeArrayRef(TE.ReuseShuffleIndices).slice(K, Sz); + if (all_of(SubMask, [](int Idx) { return Idx == UndefMaskElem; })) + continue; + SmallBitVector UsedIndices(Sz); + for (int Idx : SubMask) { + if (Idx == UndefMaskElem) + return false; + UsedIndices.set(Idx); + } + if (!UsedIndices.all()) + return false; + } + return true; + }; + if (!IsClusteredReuse(TE)) + return None; + // Try build correct order for extractelement instructions. + SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(), + TE.ReuseShuffleIndices.end()); + if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() && + all_of(TE.Scalars, [Sz](Value *V) { + Optional Idx = getExtractIndex(cast(V)); + return Idx && *Idx < Sz; + })) { + SmallVector ReorderMask(Sz, UndefMaskElem); + if (TE.ReorderIndices.empty()) + std::iota(ReorderMask.begin(), ReorderMask.end(), 0); + else + inversePermutation(TE.ReorderIndices, ReorderMask); + for (unsigned I = 0; I < VF; ++I) { + int &Idx = ReusedMask[I]; + if (Idx == UndefMaskElem) + continue; + Value *V = TE.Scalars[ReorderMask[Idx]]; + Optional EI = getExtractIndex(cast(V)); + Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI)); + } + } + // Build the order of the VF size, need to reorder reuses shuffles, they are + // always of VF size. + OrdersType ResOrder(VF); + std::iota(ResOrder.begin(), ResOrder.end(), 0); + auto *It = ResOrder.begin(); + for (unsigned K = 0; K < VF; K += Sz) { + OrdersType CurrentOrder(TE.ReorderIndices); + SmallVector SubMask(makeArrayRef(ReusedMask).slice(K, Sz)); + if (SubMask.front() == UndefMaskElem) + std::iota(SubMask.begin(), SubMask.end(), 0); + reorderOrder(CurrentOrder, SubMask); + transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); + std::advance(It, Sz); + } + if (all_of(enumerate(ResOrder), + [](const auto &Data) { return Data.index() == Data.value(); })) + return {}; // Use identity order. + return ResOrder; + } if (TE.State == TreeEntry::Vectorize && (isa(TE.getMainOp()) || (TopToBottom && isa(TE.getMainOp()))) && @@ -3783,8 +3851,8 @@ UserTE = UserTE->UserTreeIndices.back().UserTE; ++Cnt; } - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - if (TE->State != TreeEntry::Vectorize) + VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); + if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } }); @@ -3808,12 +3876,13 @@ for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, // just need to merge reordering shuffle and the reuse shuffle. - if (!OpTE->ReuseShuffleIndices.empty()) + if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) continue; // Count number of orders uses. const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) { + if (OpTE->State == TreeEntry::NeedToGather || + !OpTE->ReuseShuffleIndices.empty()) { auto It = GathersToOrders.find(OpTE); if (It != GathersToOrders.end()) return It->second; @@ -3830,7 +3899,15 @@ if (It != ExternalUserReorderMap.end()) { const auto &ExternalUserReorderIndices = It->second; for (const OrdersType &ExtOrder : ExternalUserReorderIndices) - ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + // If the OpTE vector factor != number of scalars (ExtOrder size) - + // use natural order, it is an attempt to reorder node with reused + // scalars but with external uses. + ++OrdersUses + .insert(std::make_pair( + OpTE->getVectorFactor() == ExtOrder.size() ? ExtOrder + : OrdersType(), + 0)) + .first->second; // No other useful reorder data in this entry. if (Order.empty()) continue; @@ -3990,7 +4067,7 @@ if (Optional CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - if (TE->State != TreeEntry::Vectorize) + if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } }); @@ -4062,10 +4139,11 @@ TreeEntry *OpTE = Op.second; if (!VisitedOps.insert(OpTE).second) continue; - if (!OpTE->ReuseShuffleIndices.empty()) + if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) continue; const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) + if (OpTE->State == TreeEntry::NeedToGather || + !OpTE->ReuseShuffleIndices.empty()) return GathersToOrders.find(OpTE)->second; return OpTE->ReorderIndices; }(); @@ -5499,7 +5577,7 @@ ((P0 == CurrentPred && !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || (AltP0 == CurrentPred && - areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) + !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) std::swap(LHS, RHS); } else if (P0 != CurrentPred && AltP0 != CurrentPred) { std::swap(LHS, RHS); @@ -5804,7 +5882,10 @@ CmpInst::Predicate CurrentPred = CI->getPredicate(); if (P0 == AltP0Swapped) return I == AltCI0 || - (I != MainOp && + (I != CI0 && AltP0 == CurrentPred && + areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), + CI->getOperand(0), CI->getOperand(1))) || + (I != CI0 && P0 == CurrentPred && !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), CI->getOperand(0), CI->getOperand(1))); return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -86,15 +86,16 @@ define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_merge_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T_FR]], -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP3]], 0 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[T_FR]], [[SHIFT]] -; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i64 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <8 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <8 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = freeze <8 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[SHIFT]], [[T]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 ; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00 ; CHECK-NEXT: ret float [[RETVAL_0]] ; @@ -400,14 +401,16 @@ define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-LABEL: @test_merge_anyof_v4si( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[T_FR]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i1> [[TMP1]] to i4 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP2]], 0 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[T_FR]], [[SHIFT]] -; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = freeze <8 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP5]], i64 0 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float ; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00 ; CHECK-NEXT: ret float [[RETVAL_0]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll @@ -5,16 +5,15 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = load i16, i16* undef, align 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[CALL37]], i32 6 -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i16> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i1> [[TMP6]] to <8 x i16> -; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP8]], 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <8 x i16> [[SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP6]], 0 ; CHECK-NEXT: ret i16 [[OP_RDX]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll @@ -49,7 +49,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[CONV_I32_I_I_I1]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x float> zeroinitializer, <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll @@ -11,18 +11,19 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* undef, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[X]] to <2 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 16 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], undef ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], undef ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP8]], undef ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[INS1:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP10]], 0 -; CHECK-NEXT: [[INS2:%.*]] = insertvalue { <2 x float>, <2 x float> } [[INS1]], <2 x float> [[TMP11]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[INS1:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP11]], 0 +; CHECK-NEXT: [[INS2:%.*]] = insertvalue { <2 x float>, <2 x float> } [[INS1]], <2 x float> [[TMP12]], 1 ; CHECK-NEXT: ret { <2 x float>, <2 x float> } [[INS2]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -228,14 +228,13 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) +; CHECK-NEXT: ret i1 [[TMP5]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -261,16 +260,15 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: call void @use1(i1 [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 +; CHECK-NEXT: call void @use1(i1 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) +; CHECK-NEXT: ret i1 [[TMP6]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -345,11 +343,11 @@ ; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 ; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X0]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6