diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" @@ -58,6 +59,7 @@ using StoreListMap = MapVector; using GEPList = SmallVector; using GEPListMap = MapVector; + using InstSetVector = SmallSetVector; ScalarEvolution *SE = nullptr; TargetTransformInfo *TTI = nullptr; @@ -124,8 +126,8 @@ /// Tries to vectorize constructs started from CmpInst, InsertValueInst or /// InsertElementInst instructions. - bool vectorizeSimpleInstructions(SmallVectorImpl &Instructions, - BasicBlock *BB, slpvectorizer::BoUpSLP &R, + bool vectorizeSimpleInstructions(InstSetVector &Instructions, BasicBlock *BB, + slpvectorizer::BoUpSLP &R, bool AtTerminator); /// Scan the basic block and look for patterns that are likely to start diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11966,9 +11966,9 @@ return IsCompatibility; } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { +bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions, + BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { bool OpsChanged = false; SmallVector PostponedCmps; for (auto *I : reverse(Instructions)) { @@ -12037,9 +12037,10 @@ /*LimitForRegisterSize=*/true); Instructions.clear(); } else { + Instructions.clear(); // Insert in reverse order since the PostponedCmps vector was filled in // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend()); } return OpsChanged; } @@ -12192,7 +12193,7 @@ VisitedInstrs.clear(); - SmallVector PostProcessInstructions; + InstSetVector PostProcessInstructions; SmallDenseSet KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // Skip instructions with scalable type. The num of elements is unknown at @@ -12244,8 +12245,12 @@ !DT->isReachableFromEntry(P->getIncomingBlock(I))) continue; - Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), - P->getIncomingBlock(I), R, TTI); + // Postponed instructions should not be vectorized here, delay their + // vectorization. + if (auto *PI = dyn_cast(P->getIncomingValue(I)); + PI && !PostProcessInstructions.contains(PI)) + Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), + P->getIncomingBlock(I), R, TTI); } continue; } @@ -12272,8 +12277,12 @@ } if (TryToVectorizeRoot) { for (auto *V : it->operand_values()) { - // Try to match and vectorize a horizontal reduction. - OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); + // Postponed instructions should not be vectorized here, delay their + // vectorization. + if (auto *VI = dyn_cast(V); + VI && !PostProcessInstructions.contains(VI)) + // Try to match and vectorize a horizontal reduction. + OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); } } // Start vectorization of post-process list of instructions from the @@ -12292,7 +12301,7 @@ } if (isa(it)) - PostProcessInstructions.push_back(&*it); + PostProcessInstructions.insert(&*it); } return Changed; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -430,21 +430,9 @@ ; Make sure that vectorization happens even if insertelements operations ; must be rescheduled. The case here is from compiling Julia. define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { -; THRESHOLD-LABEL: @reschedule_extract( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; NOTHRESHOLD-LABEL: @reschedule_extract( -; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; MINTREESIZE-LABEL: @reschedule_extract( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; CHECK-LABEL: @reschedule_extract( +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -468,21 +456,9 @@ ; Check that cost model for vectorization takes credit for ; instructions that are erased. define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { -; THRESHOLD-LABEL: @take_credit( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; NOTHRESHOLD-LABEL: @take_credit( -; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; MINTREESIZE-LABEL: @take_credit( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; CHECK-LABEL: @take_credit( +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -530,21 +506,9 @@ } define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 { -; THRESHOLD-LABEL: @_vadd256( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <8 x float> [[TMP1]] -; -; NOTHRESHOLD-LABEL: @_vadd256( -; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]] -; -; MINTREESIZE-LABEL: @_vadd256( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <8 x float> [[TMP5]] +; CHECK-LABEL: @_vadd256( +; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -465,21 +465,9 @@ ; Make sure that vectorization happens even if insertelements operations ; must be rescheduled. The case here is from compiling Julia. define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { -; THRESHOLD-LABEL: @reschedule_extract( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; NOTHRESHOLD-LABEL: @reschedule_extract( -; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; MINTREESIZE-LABEL: @reschedule_extract( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; CHECK-LABEL: @reschedule_extract( +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -503,21 +491,9 @@ ; Check that cost model for vectorization takes credit for ; instructions that are erased. define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { -; THRESHOLD-LABEL: @take_credit( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; NOTHRESHOLD-LABEL: @take_credit( -; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] -; -; MINTREESIZE-LABEL: @take_credit( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; CHECK-LABEL: @take_credit( +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -565,21 +541,9 @@ } define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 { -; THRESHOLD-LABEL: @_vadd256( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <8 x float> [[TMP1]] -; -; NOTHRESHOLD-LABEL: @_vadd256( -; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]] -; -; MINTREESIZE-LABEL: @_vadd256( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <8 x float> [[TMP5]] +; CHECK-LABEL: @_vadd256( +; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll @@ -5,23 +5,19 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, double* [[P2:%.*]], i64 54 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1754:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1778:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1754]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[I1771]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1781]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[I1792]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> , double [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x double> [[TMP12]], [[TMP13]] -; CHECK-NEXT: ret <4 x double> [[TMP14]] +; CHECK-NEXT: [[I1772:%.*]] = load double, double* [[I1771]], align 8 +; CHECK-NEXT: [[I1795:%.*]] = getelementptr inbounds double, double* [[P2]], i64 55 +; CHECK-NEXT: [[I1796:%.*]] = load double, double* [[I1795]], align 8 +; CHECK-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[TMP3]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> , double [[I1797]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <4 x double> [[TMP6]] ; entry: %i1771 = getelementptr inbounds double, double* %p2, i64 54 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64 -S | FileCheck %s --check-prefixes=SSE ; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v2 -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -opaque-pointers -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX2 define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %x, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %y) { ; SSE-LABEL: @compute_min( @@ -123,6 +123,74 @@ ; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 ; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; +; AVX2-LABEL: @compute_min( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X:%.*]], i64 0, i64 1 +; AVX2-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y:%.*]], i64 0, i64 1 +; AVX2-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2 +; AVX2-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 +; AVX2-NEXT: [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3 +; AVX2-NEXT: [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3 +; AVX2-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 +; AVX2-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 +; AVX2-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 +; AVX2-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 +; AVX2-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 +; AVX2-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 +; AVX2-NEXT: [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7 +; AVX2-NEXT: [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7 +; AVX2-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y]], align 2 +; AVX2-NEXT: [[TMP1:%.*]] = load i16, ptr [[X]], align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 +; AVX2-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 +; AVX2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2 +; AVX2-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2 +; AVX2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2 +; AVX2-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2 +; AVX2-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 +; AVX2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i32 0 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[TMP8]], i32 1 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP12]], i16 [[TMP9]], i32 1 +; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP13]]) +; AVX2-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 +; AVX2-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 +; AVX2-NEXT: [[TMP17:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i32 0 +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> [[TMP17]], i16 [[TMP15]], i32 1 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP16]], i32 1 +; AVX2-NEXT: [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP18]], <2 x i16> [[TMP20]]) +; AVX2-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2 +; AVX2-NEXT: [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <2 x i16> [[TMP24]], i16 [[TMP22]], i32 1 +; AVX2-NEXT: [[TMP26:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0 +; AVX2-NEXT: [[TMP27:%.*]] = insertelement <2 x i16> [[TMP26]], i16 [[TMP23]], i32 1 +; AVX2-NEXT: [[TMP28:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP25]], <2 x i16> [[TMP27]]) +; AVX2-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2 +; AVX2-NEXT: [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2 +; AVX2-NEXT: [[TMP31:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP32:%.*]] = insertelement <2 x i16> [[TMP31]], i16 [[TMP29]], i32 1 +; AVX2-NEXT: [[TMP33:%.*]] = insertelement <2 x i16> poison, i16 [[TMP5]], i32 0 +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <2 x i16> [[TMP33]], i16 [[TMP30]], i32 1 +; AVX2-NEXT: [[TMP35:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP32]], <2 x i16> [[TMP34]]) +; AVX2-NEXT: [[TMP36:%.*]] = zext <2 x i16> [[TMP35]] to <2 x i64> +; AVX2-NEXT: [[TMP37:%.*]] = shl nuw <2 x i64> [[TMP36]], +; AVX2-NEXT: [[TMP38:%.*]] = zext <2 x i16> [[TMP28]] to <2 x i64> +; AVX2-NEXT: [[TMP39:%.*]] = shl nuw <2 x i64> [[TMP38]], +; AVX2-NEXT: [[TMP40:%.*]] = or <2 x i64> [[TMP37]], [[TMP39]] +; AVX2-NEXT: [[TMP41:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64> +; AVX2-NEXT: [[TMP42:%.*]] = shl nuw nsw <2 x i64> [[TMP41]], +; AVX2-NEXT: [[TMP43:%.*]] = or <2 x i64> [[TMP40]], [[TMP42]] +; AVX2-NEXT: [[TMP44:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64> +; AVX2-NEXT: [[TMP45:%.*]] = or <2 x i64> [[TMP43]], [[TMP44]] +; AVX2-NEXT: [[TMP46:%.*]] = extractelement <2 x i64> [[TMP45]], i32 0 +; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP46]], 0 +; AVX2-NEXT: [[TMP47:%.*]] = extractelement <2 x i64> [[TMP45]], i32 1 +; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP47]], 1 +; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] +; entry: %0 = load i16, ptr %y, align 2 %1 = load i16, ptr %x, align 2