diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5536,10 +5536,15 @@ } unsigned BoUpSLP::getVectorElementSize(Value *V) { - // If V is a store, just return the width of the stored value without - // traversing the expression tree. This is the common case. - if (auto *Store = dyn_cast(V)) - return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + // If V is a store, just return the width of the stored value (or value + // truncated just before storing) without traversing the expression tree. + // This is the common case. + if (auto *Store = dyn_cast(V)) { + if (auto *Trunc = dyn_cast(Store->getValueOperand())) + return DL->getTypeSizeInBits(Trunc->getSrcTy()); + else + return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + } auto E = InstrElementSize.find(V); if (E != InstrElementSize.end()) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll @@ -22,132 +22,304 @@ } define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i8* noalias nocapture readonly %d, i8* noalias nocapture %e, i32 %w) local_unnamed_addr #1 { -; CHECK-LABEL: @bar( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[W]], i32 13 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1 -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1 -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2 -; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2 -; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2 -; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3 -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3 -; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3 -; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3 -; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4 -; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4 -; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4 -; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4 -; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4 -; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5 -; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5 -; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5 -; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5 -; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5 -; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6 -; CHECK-NEXT: [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6 -; CHECK-NEXT: [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6 -; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6 -; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6 -; CHECK-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7 -; CHECK-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7 -; CHECK-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7 -; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7 -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7 -; CHECK-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8 -; CHECK-NEXT: [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8 -; CHECK-NEXT: [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8 -; CHECK-NEXT: [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8 -; CHECK-NEXT: [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8 -; CHECK-NEXT: [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9 -; CHECK-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9 -; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9 -; CHECK-NEXT: [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9 -; CHECK-NEXT: [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9 -; CHECK-NEXT: [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10 -; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10 -; CHECK-NEXT: [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10 -; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10 -; CHECK-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10 -; CHECK-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11 -; CHECK-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11 -; CHECK-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11 -; CHECK-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11 -; CHECK-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11 -; CHECK-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12 -; CHECK-NEXT: [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12 -; CHECK-NEXT: [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12 -; CHECK-NEXT: [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12 -; CHECK-NEXT: [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12 -; CHECK-NEXT: [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13 -; CHECK-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13 -; CHECK-NEXT: [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13 -; CHECK-NEXT: [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13 -; CHECK-NEXT: [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13 -; CHECK-NEXT: [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14 -; CHECK-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14 -; CHECK-NEXT: [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14 -; CHECK-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14 -; CHECK-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14 -; CHECK-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>* -; CHECK-NEXT: [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1 -; CHECK-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>* -; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1 -; CHECK-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>* -; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1 -; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>* -; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]] -; CHECK-NEXT: [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]] -; CHECK-NEXT: [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8> -; CHECK-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP28]], <16 x i8>* [[TMP29]], align 1 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16 -; CHECK-NEXT: [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16 -; CHECK-NEXT: [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16 -; CHECK-NEXT: [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16 -; CHECK-NEXT: [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void +; SSE-LABEL: @bar( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[W:%.*]], i32 0 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[W]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[W]], i32 2 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[W]], i32 3 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[W]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[W]], i32 1 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[W]], i32 2 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[W]], i32 3 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[W]], i32 1 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[W]], i32 2 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[W]], i32 3 +; SSE-NEXT: br label [[FOR_BODY:%.*]] +; SSE: for.body: +; SSE-NEXT: [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1 +; SSE-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1 +; SSE-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1 +; SSE-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1 +; SSE-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1 +; SSE-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2 +; SSE-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2 +; SSE-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2 +; SSE-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2 +; SSE-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2 +; SSE-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3 +; SSE-NEXT: [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <4 x i8>* +; SSE-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1 +; SSE-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3 +; SSE-NEXT: [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <4 x i8>* +; SSE-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 +; SSE-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3 +; SSE-NEXT: [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <4 x i8>* +; SSE-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 +; SSE-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3 +; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <4 x i8>* +; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1 +; SSE-NEXT: [[TMP24:%.*]] = icmp ult <4 x i8> [[TMP17]], [[TMP19]] +; SSE-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i8> [[TMP23]], <4 x i8> [[TMP21]] +; SSE-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32> +; SSE-NEXT: [[TMP27:%.*]] = mul <4 x i32> [[TMP26]], [[TMP3]] +; SSE-NEXT: [[TMP28:%.*]] = trunc <4 x i32> [[TMP27]] to <4 x i8> +; SSE-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3 +; SSE-NEXT: [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <4 x i8>* +; SSE-NEXT: store <4 x i8> [[TMP28]], <4 x i8>* [[TMP29]], align 1 +; SSE-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4 +; SSE-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4 +; SSE-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4 +; SSE-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4 +; SSE-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4 +; SSE-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5 +; SSE-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5 +; SSE-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5 +; SSE-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5 +; SSE-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5 +; SSE-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6 +; SSE-NEXT: [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6 +; SSE-NEXT: [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6 +; SSE-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6 +; SSE-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6 +; SSE-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7 +; SSE-NEXT: [[TMP30:%.*]] = bitcast i8* [[ARRAYIDX45]] to <4 x i8>* +; SSE-NEXT: [[TMP31:%.*]] = load <4 x i8>, <4 x i8>* [[TMP30]], align 1 +; SSE-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7 +; SSE-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX47]] to <4 x i8>* +; SSE-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1 +; SSE-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7 +; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX49]] to <4 x i8>* +; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1 +; SSE-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7 +; SSE-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX52]] to <4 x i8>* +; SSE-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1 +; SSE-NEXT: [[TMP38:%.*]] = icmp ult <4 x i8> [[TMP31]], [[TMP33]] +; SSE-NEXT: [[TMP39:%.*]] = select <4 x i1> [[TMP38]], <4 x i8> [[TMP37]], <4 x i8> [[TMP35]] +; SSE-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i32> +; SSE-NEXT: [[TMP41:%.*]] = mul <4 x i32> [[TMP40]], [[TMP7]] +; SSE-NEXT: [[TMP42:%.*]] = trunc <4 x i32> [[TMP41]] to <4 x i8> +; SSE-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7 +; SSE-NEXT: [[TMP43:%.*]] = bitcast i8* [[ARRAYIDX56]] to <4 x i8>* +; SSE-NEXT: store <4 x i8> [[TMP42]], <4 x i8>* [[TMP43]], align 1 +; SSE-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8 +; SSE-NEXT: [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8 +; SSE-NEXT: [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8 +; SSE-NEXT: [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8 +; SSE-NEXT: [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8 +; SSE-NEXT: [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9 +; SSE-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9 +; SSE-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9 +; SSE-NEXT: [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9 +; SSE-NEXT: [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9 +; SSE-NEXT: [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10 +; SSE-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10 +; SSE-NEXT: [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10 +; SSE-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10 +; SSE-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10 +; SSE-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11 +; SSE-NEXT: [[TMP44:%.*]] = bitcast i8* [[ARRAYIDX93]] to <4 x i8>* +; SSE-NEXT: [[TMP45:%.*]] = load <4 x i8>, <4 x i8>* [[TMP44]], align 1 +; SSE-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11 +; SSE-NEXT: [[TMP46:%.*]] = bitcast i8* [[ARRAYIDX95]] to <4 x i8>* +; SSE-NEXT: [[TMP47:%.*]] = load <4 x i8>, <4 x i8>* [[TMP46]], align 1 +; SSE-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11 +; SSE-NEXT: [[TMP48:%.*]] = bitcast i8* [[ARRAYIDX97]] to <4 x i8>* +; SSE-NEXT: [[TMP49:%.*]] = load <4 x i8>, <4 x i8>* [[TMP48]], align 1 +; SSE-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11 +; SSE-NEXT: [[TMP50:%.*]] = bitcast i8* [[ARRAYIDX100]] to <4 x i8>* +; SSE-NEXT: [[TMP51:%.*]] = load <4 x i8>, <4 x i8>* [[TMP50]], align 1 +; SSE-NEXT: [[TMP52:%.*]] = icmp ult <4 x i8> [[TMP45]], [[TMP47]] +; SSE-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP52]], <4 x i8> [[TMP51]], <4 x i8> [[TMP49]] +; SSE-NEXT: [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32> +; SSE-NEXT: [[TMP55:%.*]] = mul <4 x i32> [[TMP54]], [[TMP11]] +; SSE-NEXT: [[TMP56:%.*]] = trunc <4 x i32> [[TMP55]] to <4 x i8> +; SSE-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11 +; SSE-NEXT: [[TMP57:%.*]] = bitcast i8* [[ARRAYIDX104]] to <4 x i8>* +; SSE-NEXT: store <4 x i8> [[TMP56]], <4 x i8>* [[TMP57]], align 1 +; SSE-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12 +; SSE-NEXT: [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12 +; SSE-NEXT: [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12 +; SSE-NEXT: [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12 +; SSE-NEXT: [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12 +; SSE-NEXT: [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13 +; SSE-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13 +; SSE-NEXT: [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13 +; SSE-NEXT: [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13 +; SSE-NEXT: [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13 +; SSE-NEXT: [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14 +; SSE-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14 +; SSE-NEXT: [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14 +; SSE-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14 +; SSE-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14 +; SSE-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15 +; SSE-NEXT: [[TMP58:%.*]] = bitcast i8* [[ARRAYIDX141]] to <4 x i8>* +; SSE-NEXT: [[TMP59:%.*]] = load <4 x i8>, <4 x i8>* [[TMP58]], align 1 +; SSE-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 +; SSE-NEXT: [[TMP60:%.*]] = bitcast i8* [[ARRAYIDX143]] to <4 x i8>* +; SSE-NEXT: [[TMP61:%.*]] = load <4 x i8>, <4 x i8>* [[TMP60]], align 1 +; SSE-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 +; SSE-NEXT: [[TMP62:%.*]] = bitcast i8* [[ARRAYIDX145]] to <4 x i8>* +; SSE-NEXT: [[TMP63:%.*]] = load <4 x i8>, <4 x i8>* [[TMP62]], align 1 +; SSE-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 +; SSE-NEXT: [[TMP64:%.*]] = bitcast i8* [[ARRAYIDX148]] to <4 x i8>* +; SSE-NEXT: [[TMP65:%.*]] = load <4 x i8>, <4 x i8>* [[TMP64]], align 1 +; SSE-NEXT: [[TMP66:%.*]] = icmp ult <4 x i8> [[TMP59]], [[TMP61]] +; SSE-NEXT: [[TMP67:%.*]] = select <4 x i1> [[TMP66]], <4 x i8> [[TMP65]], <4 x i8> [[TMP63]] +; SSE-NEXT: [[TMP68:%.*]] = zext <4 x i8> [[TMP67]] to <4 x i32> +; SSE-NEXT: [[TMP69:%.*]] = mul <4 x i32> [[TMP68]], [[TMP15]] +; SSE-NEXT: [[TMP70:%.*]] = trunc <4 x i32> [[TMP69]] to <4 x i8> +; SSE-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 +; SSE-NEXT: [[TMP71:%.*]] = bitcast i8* [[ARRAYIDX152]] to <4 x i8>* +; SSE-NEXT: store <4 x i8> [[TMP70]], <4 x i8>* [[TMP71]], align 1 +; SSE-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1 +; SSE-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16 +; SSE-NEXT: [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16 +; SSE-NEXT: [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16 +; SSE-NEXT: [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16 +; SSE-NEXT: [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16 +; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8 +; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; SSE: for.end: +; SSE-NEXT: ret void +; +; AVX512-LABEL: @bar( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1 +; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2 +; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4 +; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5 +; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6 +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7 +; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8 +; AVX512-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9 +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10 +; AVX512-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11 +; AVX512-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12 +; AVX512-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[W]], i32 13 +; AVX512-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14 +; AVX512-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15 +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1 +; AVX512-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1 +; AVX512-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1 +; AVX512-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1 +; AVX512-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1 +; AVX512-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2 +; AVX512-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2 +; AVX512-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2 +; AVX512-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2 +; AVX512-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2 +; AVX512-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3 +; AVX512-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3 +; AVX512-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3 +; AVX512-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3 +; AVX512-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3 +; AVX512-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4 +; AVX512-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4 +; AVX512-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4 +; AVX512-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4 +; AVX512-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4 +; AVX512-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5 +; AVX512-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5 +; AVX512-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5 +; AVX512-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5 +; AVX512-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5 +; AVX512-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6 +; AVX512-NEXT: [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6 +; AVX512-NEXT: [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6 +; AVX512-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6 +; AVX512-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6 +; AVX512-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7 +; AVX512-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7 +; AVX512-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7 +; AVX512-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7 +; AVX512-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7 +; AVX512-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8 +; AVX512-NEXT: [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8 +; AVX512-NEXT: [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8 +; AVX512-NEXT: [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8 +; AVX512-NEXT: [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8 +; AVX512-NEXT: [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9 +; AVX512-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9 +; AVX512-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9 +; AVX512-NEXT: [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9 +; AVX512-NEXT: [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9 +; AVX512-NEXT: [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10 +; AVX512-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10 +; AVX512-NEXT: [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10 +; AVX512-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10 +; AVX512-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10 +; AVX512-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11 +; AVX512-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11 +; AVX512-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11 +; AVX512-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11 +; AVX512-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11 +; AVX512-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12 +; AVX512-NEXT: [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12 +; AVX512-NEXT: [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12 +; AVX512-NEXT: [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12 +; AVX512-NEXT: [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12 +; AVX512-NEXT: [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13 +; AVX512-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13 +; AVX512-NEXT: [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13 +; AVX512-NEXT: [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13 +; AVX512-NEXT: [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13 +; AVX512-NEXT: [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14 +; AVX512-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14 +; AVX512-NEXT: [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14 +; AVX512-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14 +; AVX512-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14 +; AVX512-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15 +; AVX512-NEXT: [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>* +; AVX512-NEXT: [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1 +; AVX512-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 +; AVX512-NEXT: [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>* +; AVX512-NEXT: [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1 +; AVX512-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 +; AVX512-NEXT: [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>* +; AVX512-NEXT: [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1 +; AVX512-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 +; AVX512-NEXT: [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>* +; AVX512-NEXT: [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1 +; AVX512-NEXT: [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]] +; AVX512-NEXT: [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]] +; AVX512-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32> +; AVX512-NEXT: [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]] +; AVX512-NEXT: [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8> +; AVX512-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 +; AVX512-NEXT: [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>* +; AVX512-NEXT: store <16 x i8> [[TMP28]], <16 x i8>* [[TMP29]], align 1 +; AVX512-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1 +; AVX512-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16 +; AVX512-NEXT: [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16 +; AVX512-NEXT: [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16 +; AVX512-NEXT: [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16 +; AVX512-NEXT: [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16 +; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; AVX512: for.end: +; AVX512-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -51,41 +51,18 @@ define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i8( -; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[TMP0:%.*]], align 1, [[TBAA4:!tbaa !.*]] -; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 15 -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 255 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP7]], i32 255 -; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[TMP9]] to i8 -; CHECK-NEXT: store i8 [[TMP10]], i8* [[TMP0]], align 1, [[TBAA4]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP11]], align 1, [[TBAA4]] -; CHECK-NEXT: [[TMP13:%.*]] = zext i8 [[TMP12]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP1]] -; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = icmp ult i32 [[TMP15]], 255 -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP15]], i32 255 -; CHECK-NEXT: [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8 -; CHECK-NEXT: store i8 [[TMP18]], i8* [[TMP11]], align 1, [[TBAA4]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 2 -; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP19]], align 1, [[TBAA4]] -; CHECK-NEXT: [[TMP21:%.*]] = zext i8 [[TMP20]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP1]] -; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 15 -; CHECK-NEXT: [[TMP24:%.*]] = icmp ult i32 [[TMP23]], 255 -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 255 -; CHECK-NEXT: [[TMP26:%.*]] = trunc i32 [[TMP25]] to i8 -; CHECK-NEXT: store i8 [[TMP26]], i8* [[TMP19]], align 1, [[TBAA4]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 3 -; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[TMP27]], align 1, [[TBAA4]] -; CHECK-NEXT: [[TMP29:%.*]] = zext i8 [[TMP28]] to i32 -; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], [[TMP1]] -; CHECK-NEXT: [[TMP31:%.*]] = lshr i32 [[TMP30]], 15 -; CHECK-NEXT: [[TMP32:%.*]] = icmp ult i32 [[TMP31]], 255 -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 255 -; CHECK-NEXT: [[TMP34:%.*]] = trunc i32 [[TMP33]] to i8 -; CHECK-NEXT: store i8 [[TMP34]], i8* [[TMP27]], align 1, [[TBAA4]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP0:%.*]] to <4 x i8>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, [[TBAA4:!tbaa !.*]] +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1:%.*]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, [[TBAA4]] ; CHECK-NEXT: ret void ; %4 = load i8, i8* %0, align 1, !tbaa !6