diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5761,8 +5761,27 @@ } }; if (E->State == TreeEntry::NeedToGather) { - if (allConstant(VL)) - return 0; + if (allConstant(VL)) { + if (all_of(VL, [](Value *V) { + if (isa(V)) + return true; + auto *C = cast(V); + return C->isZeroValue(); + })) + return 0; + // Check if we have same buildvector already. + if (any_of(VectorizableTree, [E, + VL](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && TE.get() != E && + TE->Idx > E->Idx && TE->isSame(VL); + })) + return 0; + // Usually, constant buildvector results in a vector load from a + // constant/data pool. + return TTI->getMemoryOpCost(Instruction::Load, FinalVecTy, + DL->getABITypeAlign(FinalVecTy), + /*AddressSpace=*/0, CostKind); + } if (isa(VL[0])) return InstructionCost::getInvalid(); SmallVector Mask; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -10,14 +10,14 @@ ; REMARK-LABEL: Function: gather_multiple_use ; REMARK: Args: ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; REMARK-NEXT: - Cost: '-7' +; REMARK-NEXT: - Cost: '-4' ; ; REMARK-NOT: Function: gather_load define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @gather_multiple_use( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i64 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1237,105 +1237,421 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P1:%.*]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[P2:%.*]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 [[CONV4]], [[CONV6]] +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i32 [[SUB7]], 16 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SHL]], [[SUB]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[TMP5]] to i32 +; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 [[CONV9]], [[CONV11]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 5 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX13]], align 1 +; CHECK-NEXT: [[CONV14:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1 +; CHECK-NEXT: [[CONV16:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[SUB17:%.*]] = sub nsw i32 [[CONV14]], [[CONV16]] +; CHECK-NEXT: [[SHL18:%.*]] = shl nsw i32 [[SUB17]], 16 +; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[SHL18]], [[SUB12]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[CONV21:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1 +; CHECK-NEXT: [[CONV23:%.*]] = zext i8 [[TMP9]] to i32 +; CHECK-NEXT: [[SUB24:%.*]] = sub nsw i32 [[CONV21]], [[CONV23]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX25]], align 1 +; CHECK-NEXT: [[CONV26:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1 +; CHECK-NEXT: [[CONV28:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[SUB29:%.*]] = sub nsw i32 [[CONV26]], [[CONV28]] +; CHECK-NEXT: [[SHL30:%.*]] = shl nsw i32 [[SUB29]], 16 +; CHECK-NEXT: [[ADD31:%.*]] = add nsw i32 [[SHL30]], [[SUB24]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1 +; CHECK-NEXT: [[CONV35:%.*]] = zext i8 [[TMP13]] to i32 +; CHECK-NEXT: [[SUB36:%.*]] = sub nsw i32 [[CONV33]], [[CONV35]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 7 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX37]], align 1 +; CHECK-NEXT: [[CONV38:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1 +; CHECK-NEXT: [[CONV40:%.*]] = zext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[SUB41:%.*]] = sub nsw i32 [[CONV38]], [[CONV40]] +; CHECK-NEXT: [[SHL42:%.*]] = shl nsw i32 [[SUB41]], 16 +; CHECK-NEXT: [[ADD43:%.*]] = add nsw i32 [[SHL42]], [[SUB36]] +; CHECK-NEXT: [[ADD44:%.*]] = add nsw i32 [[ADD19]], [[ADD]] +; CHECK-NEXT: [[SUB45:%.*]] = sub nsw i32 [[ADD]], [[ADD19]] +; CHECK-NEXT: [[ADD46:%.*]] = add nsw i32 [[ADD43]], [[ADD31]] +; CHECK-NEXT: [[SUB47:%.*]] = sub nsw i32 [[ADD31]], [[ADD43]] +; CHECK-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD46]], [[ADD44]] +; CHECK-NEXT: [[SUB51:%.*]] = sub nsw i32 [[ADD44]], [[ADD46]] +; CHECK-NEXT: [[ADD55:%.*]] = add nsw i32 [[SUB47]], [[SUB45]] +; CHECK-NEXT: [[SUB59:%.*]] = sub nsw i32 [[SUB45]], [[SUB47]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[CONV2_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[CONV_1]], [[CONV2_1]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[CONV4_1:%.*]] = zext i8 [[TMP18]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[CONV6_1:%.*]] = zext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 [[CONV4_1]], [[CONV6_1]] +; CHECK-NEXT: [[SHL_1:%.*]] = shl nsw i32 [[SUB7_1]], 16 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[SHL_1]], [[SUB_1]] +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[CONV9_1:%.*]] = zext i8 [[TMP20]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1 +; CHECK-NEXT: [[CONV11_1:%.*]] = zext i8 [[TMP21]] to i32 +; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 [[CONV9_1]], [[CONV11_1]] +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[ARRAYIDX13_1]], align 1 +; CHECK-NEXT: [[CONV14_1:%.*]] = zext i8 [[TMP22]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1 +; CHECK-NEXT: [[CONV16_1:%.*]] = zext i8 [[TMP23]] to i32 +; CHECK-NEXT: [[SUB17_1:%.*]] = sub nsw i32 [[CONV14_1]], [[CONV16_1]] +; CHECK-NEXT: [[SHL18_1:%.*]] = shl nsw i32 [[SUB17_1]], 16 +; CHECK-NEXT: [[ADD19_1:%.*]] = add nsw i32 [[SHL18_1]], [[SUB12_1]] +; CHECK-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[ARRAYIDX20_1]], align 1 +; CHECK-NEXT: [[CONV21_1:%.*]] = zext i8 [[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1 +; CHECK-NEXT: [[CONV23_1:%.*]] = zext i8 [[TMP25]] to i32 +; CHECK-NEXT: [[SUB24_1:%.*]] = sub nsw i32 [[CONV21_1]], [[CONV23_1]] +; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[ARRAYIDX25_1]], align 1 +; CHECK-NEXT: [[CONV26_1:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1 +; CHECK-NEXT: [[CONV28_1:%.*]] = zext i8 [[TMP27]] to i32 +; CHECK-NEXT: [[SUB29_1:%.*]] = sub nsw i32 [[CONV26_1]], [[CONV28_1]] +; CHECK-NEXT: [[SHL30_1:%.*]] = shl nsw i32 [[SUB29_1]], 16 +; CHECK-NEXT: [[ADD31_1:%.*]] = add nsw i32 [[SHL30_1]], [[SUB24_1]] +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP28]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1 +; CHECK-NEXT: [[CONV35_1:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[SUB36_1:%.*]] = sub nsw i32 [[CONV33_1]], [[CONV35_1]] +; CHECK-NEXT: [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX37_1]], align 1 +; CHECK-NEXT: [[CONV38_1:%.*]] = zext i8 [[TMP30]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1 +; CHECK-NEXT: [[CONV40_1:%.*]] = zext i8 [[TMP31]] to i32 +; CHECK-NEXT: [[SUB41_1:%.*]] = sub nsw i32 [[CONV38_1]], [[CONV40_1]] +; CHECK-NEXT: [[SHL42_1:%.*]] = shl nsw i32 [[SUB41_1]], 16 +; CHECK-NEXT: [[ADD43_1:%.*]] = add nsw i32 [[SHL42_1]], [[SUB36_1]] +; CHECK-NEXT: [[ADD44_1:%.*]] = add nsw i32 [[ADD19_1]], [[ADD_1]] +; CHECK-NEXT: [[SUB45_1:%.*]] = sub nsw i32 [[ADD_1]], [[ADD19_1]] +; CHECK-NEXT: [[ADD46_1:%.*]] = add nsw i32 [[ADD43_1]], [[ADD31_1]] +; CHECK-NEXT: [[SUB47_1:%.*]] = sub nsw i32 [[ADD31_1]], [[ADD43_1]] +; CHECK-NEXT: [[ADD48_1:%.*]] = add nsw i32 [[ADD46_1]], [[ADD44_1]] +; CHECK-NEXT: [[SUB51_1:%.*]] = sub nsw i32 [[ADD44_1]], [[ADD46_1]] +; CHECK-NEXT: [[ADD55_1:%.*]] = add nsw i32 [[SUB47_1]], [[SUB45_1]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub nsw i32 [[SUB45_1]], [[SUB47_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, i8* [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[CONV_2]], [[CONV2_2]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32 +; CHECK-NEXT: [[SUB7_2:%.*]] = sub nsw i32 [[CONV4_2]], [[CONV6_2]] +; CHECK-NEXT: [[SHL_2:%.*]] = shl nsw i32 [[SUB7_2]], 16 +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[SHL_2]], [[SUB_2]] +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32 +; CHECK-NEXT: [[SUB12_2:%.*]] = sub nsw i32 [[CONV9_2]], [[CONV11_2]] +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, i8* [[ARRAYIDX13_2]], align 1 +; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 5 +; CHECK-NEXT: [[TMP39:%.*]] = load i8, i8* [[ARRAYIDX15_2]], align 1 +; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32 +; CHECK-NEXT: [[SUB17_2:%.*]] = sub nsw i32 [[CONV14_2]], [[CONV16_2]] +; CHECK-NEXT: [[SHL18_2:%.*]] = shl nsw i32 [[SUB17_2]], 16 +; CHECK-NEXT: [[ADD19_2:%.*]] = add nsw i32 [[SHL18_2]], [[SUB12_2]] +; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP40:%.*]] = load i8, i8* [[ARRAYIDX20_2]], align 1 +; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 2 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32 +; CHECK-NEXT: [[SUB24_2:%.*]] = sub nsw i32 [[CONV21_2]], [[CONV23_2]] +; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, i8* [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 6 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, i8* [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32 +; CHECK-NEXT: [[SUB29_2:%.*]] = sub nsw i32 [[CONV26_2]], [[CONV28_2]] +; CHECK-NEXT: [[SHL30_2:%.*]] = shl nsw i32 [[SUB29_2]], 16 +; CHECK-NEXT: [[ADD31_2:%.*]] = add nsw i32 [[SHL30_2]], [[SUB24_2]] +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, i8* [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 3 +; CHECK-NEXT: [[TMP45:%.*]] = load i8, i8* [[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32 +; CHECK-NEXT: [[SUB36_2:%.*]] = sub nsw i32 [[CONV33_2]], [[CONV35_2]] +; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[TMP46:%.*]] = load i8, i8* [[ARRAYIDX37_2]], align 1 +; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, i8* [[ARRAYIDX39_2]], align 1 +; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32 +; CHECK-NEXT: [[SUB41_2:%.*]] = sub nsw i32 [[CONV38_2]], [[CONV40_2]] +; CHECK-NEXT: [[SHL42_2:%.*]] = shl nsw i32 [[SUB41_2]], 16 +; CHECK-NEXT: [[ADD43_2:%.*]] = add nsw i32 [[SHL42_2]], [[SUB36_2]] +; CHECK-NEXT: [[ADD44_2:%.*]] = add nsw i32 [[ADD19_2]], [[ADD_2]] +; CHECK-NEXT: [[SUB45_2:%.*]] = sub nsw i32 [[ADD_2]], [[ADD19_2]] +; CHECK-NEXT: [[ADD46_2:%.*]] = add nsw i32 [[ADD43_2]], [[ADD31_2]] +; CHECK-NEXT: [[SUB47_2:%.*]] = sub nsw i32 [[ADD31_2]], [[ADD43_2]] +; CHECK-NEXT: [[ADD48_2:%.*]] = add nsw i32 [[ADD46_2]], [[ADD44_2]] +; CHECK-NEXT: [[SUB51_2:%.*]] = sub nsw i32 [[ADD44_2]], [[ADD46_2]] +; CHECK-NEXT: [[ADD55_2:%.*]] = add nsw i32 [[SUB47_2]], [[SUB45_2]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub nsw i32 [[SUB45_2]], [[SUB47_2]] ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP48:%.*]] = load i8, i8* [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP49:%.*]] = load i8, i8* [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32 +; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[CONV_3]], [[CONV2_3]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[TMP50:%.*]] = load i8, i8* [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[ADD_PTR_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, <4 x i8>* [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX3_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX3_3]] to <4 x i8>* -; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP17]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* -; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, <4 x i8>* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX5_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX5_3]] to <4 x i8>* -; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1 -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP37]], <4 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <16 x i8> [[TMP38]], <16 x i8> [[TMP39]], <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i8> [[TMP44]], <16 x i8> [[TMP45]], <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i8> [[TMP46]], <16 x i8> [[TMP47]], <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i8> [[TMP50]], <16 x i8> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i8> [[TMP52]], <16 x i8> [[TMP53]], <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = zext <16 x i8> [[TMP54]] to <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP49]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP29]], [[TMP43]] -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP56]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[TMP51:%.*]] = load i8, i8* [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32 +; CHECK-NEXT: [[SUB7_3:%.*]] = sub nsw i32 [[CONV4_3]], [[CONV6_3]] +; CHECK-NEXT: [[SHL_3:%.*]] = shl nsw i32 [[SUB7_3]], 16 +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[SHL_3]], [[SUB_3]] +; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = load i8, i8* [[ARRAYIDX8_3]], align 1 +; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP52]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 1 +; CHECK-NEXT: [[TMP53:%.*]] = load i8, i8* [[ARRAYIDX10_3]], align 1 +; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP53]] to i32 +; CHECK-NEXT: [[SUB12_3:%.*]] = sub nsw i32 [[CONV9_3]], [[CONV11_3]] +; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 5 +; CHECK-NEXT: [[TMP54:%.*]] = load i8, i8* [[ARRAYIDX13_3]], align 1 +; CHECK-NEXT: [[CONV14_3:%.*]] = zext i8 [[TMP54]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 5 +; CHECK-NEXT: [[TMP55:%.*]] = load i8, i8* [[ARRAYIDX15_3]], align 1 +; CHECK-NEXT: [[CONV16_3:%.*]] = zext i8 [[TMP55]] to i32 +; CHECK-NEXT: [[SUB17_3:%.*]] = sub nsw i32 [[CONV14_3]], [[CONV16_3]] +; CHECK-NEXT: [[SHL18_3:%.*]] = shl nsw i32 [[SUB17_3]], 16 +; CHECK-NEXT: [[ADD19_3:%.*]] = add nsw i32 [[SHL18_3]], [[SUB12_3]] +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 2 +; CHECK-NEXT: [[TMP56:%.*]] = load i8, i8* [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[CONV21_3:%.*]] = zext i8 [[TMP56]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 2 +; CHECK-NEXT: [[TMP57:%.*]] = load i8, i8* [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[CONV23_3:%.*]] = zext i8 [[TMP57]] to i32 +; CHECK-NEXT: [[SUB24_3:%.*]] = sub nsw i32 [[CONV21_3]], [[CONV23_3]] +; CHECK-NEXT: [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 6 +; CHECK-NEXT: [[TMP58:%.*]] = load i8, i8* [[ARRAYIDX25_3]], align 1 +; CHECK-NEXT: [[CONV26_3:%.*]] = zext i8 [[TMP58]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 6 +; CHECK-NEXT: [[TMP59:%.*]] = load i8, i8* [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[CONV28_3:%.*]] = zext i8 [[TMP59]] to i32 +; CHECK-NEXT: [[SUB29_3:%.*]] = sub nsw i32 [[CONV26_3]], [[CONV28_3]] +; CHECK-NEXT: [[SHL30_3:%.*]] = shl nsw i32 [[SUB29_3]], 16 +; CHECK-NEXT: [[ADD31_3:%.*]] = add nsw i32 [[SHL30_3]], [[SUB24_3]] +; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 3 +; CHECK-NEXT: [[TMP60:%.*]] = load i8, i8* [[ARRAYIDX32_3]], align 1 +; CHECK-NEXT: [[CONV33_3:%.*]] = zext i8 [[TMP60]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 3 +; CHECK-NEXT: [[TMP61:%.*]] = load i8, i8* [[ARRAYIDX34_3]], align 1 +; CHECK-NEXT: [[CONV35_3:%.*]] = zext i8 [[TMP61]] to i32 +; CHECK-NEXT: [[SUB36_3:%.*]] = sub nsw i32 [[CONV33_3]], [[CONV35_3]] +; CHECK-NEXT: [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 7 +; CHECK-NEXT: [[TMP62:%.*]] = load i8, i8* [[ARRAYIDX37_3]], align 1 +; CHECK-NEXT: [[CONV38_3:%.*]] = zext i8 [[TMP62]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 7 +; CHECK-NEXT: [[TMP63:%.*]] = load i8, i8* [[ARRAYIDX39_3]], align 1 +; CHECK-NEXT: [[CONV40_3:%.*]] = zext i8 [[TMP63]] to i32 +; CHECK-NEXT: [[SUB41_3:%.*]] = sub nsw i32 [[CONV38_3]], [[CONV40_3]] +; CHECK-NEXT: [[SHL42_3:%.*]] = shl nsw i32 [[SUB41_3]], 16 +; CHECK-NEXT: [[ADD43_3:%.*]] = add nsw i32 [[SHL42_3]], [[SUB36_3]] +; CHECK-NEXT: [[ADD44_3:%.*]] = add nsw i32 [[ADD19_3]], [[ADD_3]] +; CHECK-NEXT: [[SUB45_3:%.*]] = sub nsw i32 [[ADD_3]], [[ADD19_3]] +; CHECK-NEXT: [[ADD46_3:%.*]] = add nsw i32 [[ADD43_3]], [[ADD31_3]] +; CHECK-NEXT: [[SUB47_3:%.*]] = sub nsw i32 [[ADD31_3]], [[ADD43_3]] +; CHECK-NEXT: [[ADD48_3:%.*]] = add nsw i32 [[ADD46_3]], [[ADD44_3]] +; CHECK-NEXT: [[SUB51_3:%.*]] = sub nsw i32 [[ADD44_3]], [[ADD46_3]] +; CHECK-NEXT: [[ADD55_3:%.*]] = add nsw i32 [[SUB47_3]], [[SUB45_3]] +; CHECK-NEXT: [[SUB59_3:%.*]] = sub nsw i32 [[SUB45_3]], [[SUB47_3]] +; CHECK-NEXT: [[ADD78:%.*]] = add nsw i32 [[ADD48_1]], [[ADD48]] +; CHECK-NEXT: [[SUB86:%.*]] = sub nsw i32 [[ADD48]], [[ADD48_1]] +; CHECK-NEXT: [[ADD94:%.*]] = add nsw i32 [[ADD48_3]], [[ADD48_2]] +; CHECK-NEXT: [[SUB102:%.*]] = sub nsw i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 [[ADD94]], [[ADD78]] +; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 [[ADD78]], [[ADD94]] +; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 [[SUB102]], [[SUB86]] +; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 [[SUB86]], [[SUB102]] +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535 +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]] +; CHECK-NEXT: [[SHR_I184:%.*]] = lshr i32 [[ADD105]], 15 +; CHECK-NEXT: [[AND_I185:%.*]] = and i32 [[SHR_I184]], 65537 +; CHECK-NEXT: [[MUL_I186:%.*]] = mul nuw i32 [[AND_I185]], 65535 +; CHECK-NEXT: [[ADD_I187:%.*]] = add i32 [[MUL_I186]], [[ADD105]] +; CHECK-NEXT: [[XOR_I188:%.*]] = xor i32 [[ADD_I187]], [[MUL_I186]] +; CHECK-NEXT: [[SHR_I189:%.*]] = lshr i32 [[SUB104]], 15 +; CHECK-NEXT: [[AND_I190:%.*]] = and i32 [[SHR_I189]], 65537 +; CHECK-NEXT: [[MUL_I191:%.*]] = mul nuw i32 [[AND_I190]], 65535 +; CHECK-NEXT: [[ADD_I192:%.*]] = add i32 [[MUL_I191]], [[SUB104]] +; CHECK-NEXT: [[XOR_I193:%.*]] = xor i32 [[ADD_I192]], [[MUL_I191]] +; CHECK-NEXT: [[SHR_I194:%.*]] = lshr i32 [[SUB106]], 15 +; CHECK-NEXT: [[AND_I195:%.*]] = and i32 [[SHR_I194]], 65537 +; CHECK-NEXT: [[MUL_I196:%.*]] = mul nuw i32 [[AND_I195]], 65535 +; CHECK-NEXT: [[ADD_I197:%.*]] = add i32 [[MUL_I196]], [[SUB106]] +; CHECK-NEXT: [[XOR_I198:%.*]] = xor i32 [[ADD_I197]], [[MUL_I196]] +; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I188]], [[XOR_I]] +; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I193]] +; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I198]] +; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 [[ADD55_1]], [[ADD55]] +; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 [[ADD55]], [[ADD55_1]] +; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 [[ADD55_3]], [[ADD55_2]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 [[ADD55_2]], [[ADD55_3]] +; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]] +; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]] +; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]] +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15 +; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 +; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535 +; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] +; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]] +; CHECK-NEXT: [[SHR_I184_1:%.*]] = lshr i32 [[ADD105_1]], 15 +; CHECK-NEXT: [[AND_I185_1:%.*]] = and i32 [[SHR_I184_1]], 65537 +; CHECK-NEXT: [[MUL_I186_1:%.*]] = mul nuw i32 [[AND_I185_1]], 65535 +; CHECK-NEXT: [[ADD_I187_1:%.*]] = add i32 [[MUL_I186_1]], [[ADD105_1]] +; CHECK-NEXT: [[XOR_I188_1:%.*]] = xor i32 [[ADD_I187_1]], [[MUL_I186_1]] +; CHECK-NEXT: [[SHR_I189_1:%.*]] = lshr i32 [[SUB104_1]], 15 +; CHECK-NEXT: [[AND_I190_1:%.*]] = and i32 [[SHR_I189_1]], 65537 +; CHECK-NEXT: [[MUL_I191_1:%.*]] = mul nuw i32 [[AND_I190_1]], 65535 +; CHECK-NEXT: [[ADD_I192_1:%.*]] = add i32 [[MUL_I191_1]], [[SUB104_1]] +; CHECK-NEXT: [[XOR_I193_1:%.*]] = xor i32 [[ADD_I192_1]], [[MUL_I191_1]] +; CHECK-NEXT: [[SHR_I194_1:%.*]] = lshr i32 [[SUB106_1]], 15 +; CHECK-NEXT: [[AND_I195_1:%.*]] = and i32 [[SHR_I194_1]], 65537 +; CHECK-NEXT: [[MUL_I196_1:%.*]] = mul nuw i32 [[AND_I195_1]], 65535 +; CHECK-NEXT: [[ADD_I197_1:%.*]] = add i32 [[MUL_I196_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I198_1:%.*]] = xor i32 [[ADD_I197_1]], [[MUL_I196_1]] +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I188_1]], [[ADD113]] +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I193_1]] +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I198_1]] +; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 [[SUB51_1]], [[SUB51]] +; CHECK-NEXT: [[SUB86_2:%.*]] = sub nsw i32 [[SUB51]], [[SUB51_1]] +; CHECK-NEXT: [[ADD94_2:%.*]] = add nsw i32 [[SUB51_3]], [[SUB51_2]] +; CHECK-NEXT: [[SUB102_2:%.*]] = sub nsw i32 [[SUB51_2]], [[SUB51_3]] +; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 [[ADD94_2]], [[ADD78_2]] +; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], [[ADD94_2]] +; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 [[SUB102_2]], [[SUB86_2]] +; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 [[SUB86_2]], [[SUB102_2]] +; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15 +; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 +; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535 +; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] +; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]] +; CHECK-NEXT: [[SHR_I184_2:%.*]] = lshr i32 [[ADD105_2]], 15 +; CHECK-NEXT: [[AND_I185_2:%.*]] = and i32 [[SHR_I184_2]], 65537 +; CHECK-NEXT: [[MUL_I186_2:%.*]] = mul nuw i32 [[AND_I185_2]], 65535 +; CHECK-NEXT: [[ADD_I187_2:%.*]] = add i32 [[MUL_I186_2]], [[ADD105_2]] +; CHECK-NEXT: [[XOR_I188_2:%.*]] = xor i32 [[ADD_I187_2]], [[MUL_I186_2]] +; CHECK-NEXT: [[SHR_I189_2:%.*]] = lshr i32 [[SUB104_2]], 15 +; CHECK-NEXT: [[AND_I190_2:%.*]] = and i32 [[SHR_I189_2]], 65537 +; CHECK-NEXT: [[MUL_I191_2:%.*]] = mul nuw i32 [[AND_I190_2]], 65535 +; CHECK-NEXT: [[ADD_I192_2:%.*]] = add i32 [[MUL_I191_2]], [[SUB104_2]] +; CHECK-NEXT: [[XOR_I193_2:%.*]] = xor i32 [[ADD_I192_2]], [[MUL_I191_2]] +; CHECK-NEXT: [[SHR_I194_2:%.*]] = lshr i32 [[SUB106_2]], 15 +; CHECK-NEXT: [[AND_I195_2:%.*]] = and i32 [[SHR_I194_2]], 65537 +; CHECK-NEXT: [[MUL_I196_2:%.*]] = mul nuw i32 [[AND_I195_2]], 65535 +; CHECK-NEXT: [[ADD_I197_2:%.*]] = add i32 [[MUL_I196_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I198_2:%.*]] = xor i32 [[ADD_I197_2]], [[MUL_I196_2]] +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I188_2]], [[ADD113_1]] +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]] +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I193_2]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I198_2]] +; CHECK-NEXT: [[ADD78_3:%.*]] = add nsw i32 [[SUB59_1]], [[SUB59]] +; CHECK-NEXT: [[SUB86_3:%.*]] = sub nsw i32 [[SUB59]], [[SUB59_1]] +; CHECK-NEXT: [[ADD94_3:%.*]] = add nsw i32 [[SUB59_3]], [[SUB59_2]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 [[SUB59_2]], [[SUB59_3]] +; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 [[ADD94_3]], [[ADD78_3]] +; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 [[ADD78_3]], [[ADD94_3]] +; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], [[SUB86_3]] +; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 [[SUB86_3]], [[SUB102_3]] +; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15 +; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537 +; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535 +; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]] +; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]] +; CHECK-NEXT: [[SHR_I184_3:%.*]] = lshr i32 [[ADD105_3]], 15 +; CHECK-NEXT: [[AND_I185_3:%.*]] = and i32 [[SHR_I184_3]], 65537 +; CHECK-NEXT: [[MUL_I186_3:%.*]] = mul nuw i32 [[AND_I185_3]], 65535 +; CHECK-NEXT: [[ADD_I187_3:%.*]] = add i32 [[MUL_I186_3]], [[ADD105_3]] +; CHECK-NEXT: [[XOR_I188_3:%.*]] = xor i32 [[ADD_I187_3]], [[MUL_I186_3]] +; CHECK-NEXT: [[SHR_I189_3:%.*]] = lshr i32 [[SUB104_3]], 15 +; CHECK-NEXT: [[AND_I190_3:%.*]] = and i32 [[SHR_I189_3]], 65537 +; CHECK-NEXT: [[MUL_I191_3:%.*]] = mul nuw i32 [[AND_I190_3]], 65535 +; CHECK-NEXT: [[ADD_I192_3:%.*]] = add i32 [[MUL_I191_3]], [[SUB104_3]] +; CHECK-NEXT: [[XOR_I193_3:%.*]] = xor i32 [[ADD_I192_3]], [[MUL_I191_3]] +; CHECK-NEXT: [[SHR_I194_3:%.*]] = lshr i32 [[SUB106_3]], 15 +; CHECK-NEXT: [[AND_I195_3:%.*]] = and i32 [[SHR_I194_3]], 65537 +; CHECK-NEXT: [[MUL_I196_3:%.*]] = mul nuw i32 [[AND_I195_3]], 65535 +; CHECK-NEXT: [[ADD_I197_3:%.*]] = add i32 [[MUL_I196_3]], [[SUB106_3]] +; CHECK-NEXT: [[XOR_I198_3:%.*]] = xor i32 [[ADD_I197_3]], [[MUL_I196_3]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I188_3]], [[ADD113_2]] +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]] +; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I193_3]] +; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I198_3]] +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[ADD113_3]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -607,13 +607,24 @@ define void @test_bounds_removed_before_runtime_checks(%struct * %A, i32** %B, i1 %c) { ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = fmul float 1.000000e+01, 2.000000e+01 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = fmul float 3.000000e+01, 2.000000e+01 +; CHECK-NEXT: [[TMP4:%.*]] = fptosi float [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300 +; CHECK-NEXT: [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[A:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 8 +; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT]], %struct* [[A]], i64 0, i32 1 +; CHECK-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = load i32*, i32** [[B:%.*]], align 8 ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]] ; CHECK: bb14: -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 10 to i64 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 2, [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to i8* diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll @@ -12,11 +12,13 @@ define void @foo(i64* nocapture writeonly %da) { ; CHECK-128-LABEL: @foo( ; CHECK-128-NEXT: entry: -; CHECK-128-NEXT: [[TMP0:%.*]] = bitcast i64* [[DA:%.*]] to <2 x i64>* -; CHECK-128-NEXT: store <2 x i64> , <2 x i64>* [[TMP0]], align 8 +; CHECK-128-NEXT: store i64 0, i64* [[DA:%.*]], align 8 +; CHECK-128-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 1 +; CHECK-128-NEXT: store i64 1, i64* [[ARRAYIDX1]], align 8 ; CHECK-128-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 2 -; CHECK-128-NEXT: [[TMP1:%.*]] = bitcast i64* [[ARRAYIDX2]] to <2 x i64>* -; CHECK-128-NEXT: store <2 x i64> , <2 x i64>* [[TMP1]], align 8 +; CHECK-128-NEXT: store i64 2, i64* [[ARRAYIDX2]], align 8 +; CHECK-128-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 3 +; CHECK-128-NEXT: store i64 3, i64* [[ARRAYIDX3]], align 8 ; CHECK-128-NEXT: ret void ; ; CHECK-256-LABEL: @foo( @@ -45,8 +47,9 @@ define void @foo8(i8* nocapture writeonly %da) { ; CHECK-LABEL: @foo8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[DA:%.*]] to <2 x i8>* -; CHECK-NEXT: store <2 x i8> , <2 x i8>* [[TMP0]], align 8 +; CHECK-NEXT: store i8 0, i8* [[DA:%.*]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DA]], i8 1 +; CHECK-NEXT: store i8 1, i8* [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DA]], i8 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -7,20 +7,27 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3 +; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[LAST:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[FORK:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INC1:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[INC2:%.*]] = add i64 [[TMP0]], 2 +; CHECK-NEXT: [[INC11:%.*]] = add i64 1, [[INC1]] +; CHECK-NEXT: [[EXACT1:%.*]] = ashr exact i64 [[INC11]], 32 +; CHECK-NEXT: [[INC3:%.*]] = add i64 [[TMP0]], 3 ; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32 -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> , [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP3]], 0 -; CHECK-NEXT: [[OP_RDX1]] = add i64 [[TMP7]], [[OP_RDX]] +; CHECK-NEXT: [[INC12:%.*]] = add i64 1, [[INC2]] +; CHECK-NEXT: [[EXACT2:%.*]] = ashr exact i64 [[INC12]], 32 +; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[INC3]], 32 +; CHECK-NEXT: [[INC13:%.*]] = add i64 1, [[INC3]] +; CHECK-NEXT: [[EXACT3:%.*]] = ashr exact i64 [[INC13]], 32 +; CHECK-NEXT: [[FORK]] = add i64 [[TMP0]], 0 +; CHECK-NEXT: [[SUM1:%.*]] = add i64 [[EXACT3]], [[EXACT2]] +; CHECK-NEXT: [[SUM2:%.*]] = add i64 [[SUM1]], [[EXACT1]] +; CHECK-NEXT: [[ZSUM:%.*]] = add i64 [[SUM2]], 0 +; CHECK-NEXT: [[SEXT22:%.*]] = add i64 1, [[FORK]] +; CHECK-NEXT: [[EXACT4:%.*]] = ashr exact i64 [[SEXT22]], 32 +; CHECK-NEXT: [[JOIN:%.*]] = add i64 [[FORK]], [[ZSUM]] +; CHECK-NEXT: [[LAST]] = add i64 [[JOIN]], [[EXACT4]] ; CHECK-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -97,19 +97,21 @@ ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer ; AVX512-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; AVX512-NEXT: [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer -; AVX512-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; AVX512-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; AVX512-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; AVX512-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 +; AVX512-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 ; AVX512-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; AVX512-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP10]], 0 +; AVX512-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 ; AVX512-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX512-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP11]], [[TMP12]] -; AVX512-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; AVX512-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP13]], [[TMP14]] +; AVX512-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; AVX512-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] +; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] ; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] ; AVX512-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] @@ -264,20 +266,22 @@ ; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 +; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 +; AVX512-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; AVX512-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 +; AVX512-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 ; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; AVX512-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP7]], 0 +; AVX512-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 ; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; AVX512-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] ; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] +; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: @@ -389,20 +393,22 @@ ; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 +; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 +; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 +; AVX512-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 +; AVX512-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 +; AVX512-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 ; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; AVX512-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP7]], 0 +; AVX512-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 ; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; AVX512-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] ; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] +; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: @@ -556,20 +562,22 @@ ; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_5]], align 8 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i64> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8 +; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 +; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 +; AVX512-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 +; AVX512-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 +; AVX512-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 ; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; AVX512-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 -; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP7]], 0 +; AVX512-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 +; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 ; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; AVX512-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] ; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] +; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -14,18 +14,23 @@ ; CHECK-NEXT: ret void ; CHECK: if.else: ; CHECK-NEXT: [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] ; CHECK: land.lhs.true.i.1: ; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] ; CHECK: if.then7.1: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 0, 1 +; CHECK-NEXT: store i32 [[INC_1]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_1:%.*]] = add nsw i32 6, -1 +; CHECK-NEXT: store i32 [[DEC_1]], i32* [[NUB5]], align 4 ; CHECK-NEXT: br label [[FOR_INC_1]] ; CHECK: for.inc.1: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ , [[IF_THEN7_1]] ], [ , [[LAND_LHS_TRUE_I_1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[DEC_1]], [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[INC_1]], [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[INC_2]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1 +; CHECK-NEXT: store i32 [[DEC_2]], i32* [[NUB5]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -1,47 +1,84 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s +; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s --check-prefix=AVX target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) { -; CHECK-LABEL: @testfunc( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> -; CHECK-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], -; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> -; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], -; CHECK-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: -; CHECK-NEXT: ret void +; SSE-LABEL: @testfunc( +; SSE-NEXT: entry: +; SSE-NEXT: br label [[FOR_BODY:%.*]] +; SSE: for.body: +; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] +; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] +; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 +; SSE-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP1]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP3]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = fcmp olt <2 x float> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x float> [[TMP7]], <2 x float> +; SSE-NEXT: [[TMP10:%.*]] = fcmp olt <2 x float> [[TMP9]], +; SSE-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP9]], zeroinitializer +; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP10]], <2 x float> , <2 x float> [[TMP11]] +; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; SSE-NEXT: [[ADD13]] = fadd float [[TMP13]], [[TMP14]] +; SSE-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[ADD13]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = fcmp olt <2 x float> [[TMP16]], +; SSE-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP17]], <2 x float> [[TMP16]], <2 x float> +; SSE-NEXT: [[TMP19:%.*]] = fcmp olt <2 x float> [[TMP18]], +; SSE-NEXT: [[TMP20]] = select <2 x i1> [[TMP19]], <2 x float> , <2 x float> [[TMP18]] +; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; SSE: for.end: +; SSE-NEXT: ret void +; +; AVX-LABEL: @testfunc( +; AVX-NEXT: entry: +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] +; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; AVX: for.end: +; AVX-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll @@ -11,24 +11,25 @@ ; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: +; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04 ; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 +; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP4]]) +; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[T_0259]], i32 0 +; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP5]]) +; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[P3_ADDR_0258]], i32 0 +; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]] +; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll @@ -11,24 +11,25 @@ ; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: +; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04 ; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 +; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP4]]) +; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[T_0259]], i32 0 +; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP5]]) +; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[P3_ADDR_0258]], i32 0 +; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]] +; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -27,24 +27,25 @@ ; CHECK: land.rhs.lr.ph: ; CHECK-NEXT: unreachable ; CHECK: if.end98: +; CHECK-NEXT: [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] ; CHECK: if.then103: -; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef ; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2 +; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[COND125]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[DOTSUB100]], i32 1 ; CHECK-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK: for.cond.i: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ undef, [[LAND_RHS_I874:%.*]] ], [ [[TMP1]], [[IF_THEN103]] ] +; CHECK-NEXT: [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ] +; CHECK-NEXT: [[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ] ; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] ; CHECK: land.rhs.i874: ; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]] ; CHECK: for.end.i: ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] ; CHECK: if.then.i: -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], undef +; CHECK-NEXT: [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef +; CHECK-NEXT: [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef ; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]] ; CHECK: if.end.i: ; CHECK-NEXT: [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]] @@ -65,12 +66,14 @@ ; CHECK: while.end275.i: ; CHECK-NEXT: br label [[EXTEND_BW_EXIT]] ; CHECK: extend_bw.exit: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP3]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] ; CHECK-NEXT: br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]] ; CHECK: if.then157: -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[FROM1115]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1 +; CHECK-NEXT: store i32 [[ADD158]], i32* [[FROM299]], align 4 +; CHECK-NEXT: [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1 +; CHECK-NEXT: store i32 [[ADD160]], i32* [[FROM1115]], align 4 ; CHECK-NEXT: br label [[LAND_LHS_TRUE167]] ; CHECK: land.lhs.true167: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -20,6 +20,7 @@ ; CHECK-NEXT: br i1 undef, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]] ; CHECK: arrayctor.cont: ; CHECK-NEXT: [[AGG_TMP99208_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_113_119_137_149_185_329_389_416:%.*]], %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 0 +; CHECK-NEXT: [[AGG_TMP99208_SROA_1_8_IDX388:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_113_119_137_149_185_329_389_416]], %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 1 ; CHECK-NEXT: [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_113_119_137_149_185_329_389_416]], %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: br label [[FOR_COND36_PREHEADER:%.*]] ; CHECK: for.cond36.preheader: @@ -32,17 +33,17 @@ ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double [[ADD_I276_US]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> , [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> , double [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[MUL_I254_US:%.*]] = fmul double [[TMP2]], 1.400000e+02 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[MUL2_I256_US:%.*]] = fmul double [[TMP3]], 1.400000e+02 +; CHECK-NEXT: [[ADD_I243_US:%.*]] = fadd double [[MUL_I254_US]], 5.000000e+01 +; CHECK-NEXT: [[ADD4_I246_US:%.*]] = fadd double [[MUL2_I256_US]], 5.200000e+01 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP1]] +; CHECK-NEXT: store double [[ADD_I243_US]], double* [[AGG_TMP99208_SROA_0_0_IDX]], align 8 +; CHECK-NEXT: store double [[ADD4_I246_US]], double* [[AGG_TMP99208_SROA_1_8_IDX388]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -22,14 +22,14 @@ ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 +; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 +; CHECK-NEXT: store double [[ADD12]], double* [[ARRAYIDX13]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -122,29 +122,29 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[G:%.*]], i64 5 ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP3]], 4.000000e+00 -; CHECK-NEXT: br i1 [[TMP1]], label [[TMP13:%.*]], label [[TMP5:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP12:%.*]], label [[TMP5:%.*]] ; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 -; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 -; CHECK-NEXT: br label [[TMP22:%.*]] -; CHECK: 13: +; CHECK-NEXT: [[TMP6:%.*]] = fadd double [[TMP4]], 1.000000e+00 +; CHECK-NEXT: store double [[TMP6]], double* [[G]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 +; CHECK-NEXT: [[TMP8:%.*]] = load double, double* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = fmul double [[TMP8]], 3.000000e+00 +; CHECK-NEXT: [[TMP10:%.*]] = fadd double [[TMP9]], 6.000000e+00 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 +; CHECK-NEXT: store double [[TMP10]], double* [[TMP11]], align 8 +; CHECK-NEXT: br label [[TMP20:%.*]] +; CHECK: 12: +; CHECK-NEXT: [[TMP13:%.*]] = fadd double [[TMP4]], 7.000000e+00 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 +; CHECK-NEXT: store double [[TMP13]], double* [[TMP14]], align 8 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP16:%.*]] = load double, double* [[TMP15]], align 8 ; CHECK-NEXT: [[TMP17:%.*]] = fmul double [[TMP16]], 3.000000e+00 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[TMP17]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x double> [[TMP19]], -; CHECK-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP14]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP20]], <2 x double>* [[TMP21]], align 8 -; CHECK-NEXT: br label [[TMP22]] -; CHECK: 22: +; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 8.000000e+00 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 +; CHECK-NEXT: store double [[TMP18]], double* [[TMP19]], align 8 +; CHECK-NEXT: br label [[TMP20]] +; CHECK: 20: ; CHECK-NEXT: ret i32 undef ; %1 = icmp eq i32 %k, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll @@ -7,17 +7,23 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %m) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP6]], [[M:%.*]] -; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP6]], [[M]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[N:%.*]], 5 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], 9 +; CHECK-NEXT: store i32 [[ADD]], i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[N]], 9 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[MUL1]], 9 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[MUL4:%.*]] = shl i32 [[N]], 3 +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[MUL4]], 9 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = mul nsw i32 [[N]], 10 +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[MUL7]], 9 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[ADD]], [[M:%.*]] +; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[ADD]], [[M]] ; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]] ; CHECK-NEXT: ret i32 [[ADD10]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll b/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll @@ -11,13 +11,11 @@ define void @vectorize_fp128(fp128 %c, fp128 %d) #0 { ; CHECK-LABEL: @vectorize_fp128( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x fp128> poison, fp128 [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x fp128> [[TMP0]], fp128 [[D:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x fp128> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: [[OR_COND39:%.*]] = or i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call fp128 @llvm.fabs.f128(fp128 [[C:%.*]]) +; CHECK-NEXT: [[CMPINF10:%.*]] = fcmp oeq fp128 [[TMP0]], 0xL00000000000000007FFF000000000000 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fp128 @llvm.fabs.f128(fp128 [[D:%.*]]) +; CHECK-NEXT: [[CMPINF12:%.*]] = fcmp oeq fp128 [[TMP1]], 0xL00000000000000007FFF000000000000 +; CHECK-NEXT: [[OR_COND39:%.*]] = or i1 [[CMPINF10]], [[CMPINF12]] ; CHECK-NEXT: br i1 [[OR_COND39]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]] ; CHECK: if.then13: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -12,35 +12,37 @@ ; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[C_022:%.*]] = phi i32* [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[B_021:%.*]] = phi i32* [ [[B_021_BE:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[A_020:%.*]] = phi i32* [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i32* [[C_022]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i32* [[C_022]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 1 +; CHECK-NEXT: switch i32 [[TMP2]], label [[WHILE_BODY_BACKEDGE]] [ ; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint i32* [[INCDEC_PTR2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 +; CHECK-NEXT: store i32 [[TMP4]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[INCDEC_PTR9:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 2 +; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: ; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[B_021_BE]] = phi i32* [ [[INCDEC_PTR2]], [[WHILE_BODY]] ], [ [[INCDEC_PTR9]], [[SW_BB6]] ], [ [[INCDEC_PTR3]], [[SW_BB]] ] +; CHECK-NEXT: [[A_020_BE]] = phi i32* [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR7]], [[SW_BB6]] ], [ [[INCDEC_PTR4]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=AVX target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -13,21 +13,34 @@ ; zero-extend the roots back to their original sizes. ; define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { -; CHECK-LABEL: @PR31243_zext( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 -; CHECK-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 -; CHECK-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] -; CHECK-NEXT: ret i8 [[TMP_8]] +; SSE-LABEL: @PR31243_zext( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 +; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 +; SSE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64 +; SSE-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] +; SSE-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64 +; SSE-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] +; SSE-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 +; SSE-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 +; SSE-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] +; SSE-NEXT: ret i8 [[TMP_8]] +; +; AVX-LABEL: @PR31243_zext( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 +; AVX-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 +; AVX-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 +; AVX-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] +; AVX-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 +; AVX-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 +; AVX-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] +; AVX-NEXT: ret i8 [[TMP_8]] ; entry: %tmp_0 = zext i8 %v0 to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -67,49 +67,63 @@ ; SSE-LABEL: @pr35497( ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; SSE-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 +; SSE-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 +; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; SSE-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 +; SSE-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 +; SSE-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 +; SSE-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; SSE-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 +; SSE-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] +; SSE-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 +; SSE-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; SSE-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 +; SSE-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 +; SSE-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 +; SSE-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 +; SSE-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] +; SSE-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 ; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], -; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] -; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 +; SSE-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 +; SSE-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 +; SSE-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] +; SSE-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( ; AVX-NEXT: entry: ; AVX-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; AVX-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 +; AVX-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef ; AVX-NEXT: store i64 [[ADD]], i64* undef, align 1 +; AVX-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; AVX-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 +; AVX-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 +; AVX-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 +; AVX-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] ; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; AVX-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 +; AVX-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] +; AVX-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 +; AVX-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; AVX-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; AVX-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 +; AVX-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 +; AVX-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 +; AVX-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 +; AVX-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] +; AVX-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 ; AVX-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 -; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], -; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 -; AVX-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], -; AVX-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] -; AVX-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* -; AVX-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 +; AVX-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 +; AVX-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 +; AVX-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] +; AVX-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 ; AVX-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2P +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX2P define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i32( @@ -143,19 +143,58 @@ ; ; AVX-LABEL: @store_i64( ; AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] -; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer -; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP6]], [[SHUFFLE]] -; AVX-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; AVX-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], -; AVX-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> -; AVX-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* -; AVX-NEXT: store <4 x i64> [[TMP13]], <4 x i64>* [[TMP14]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 +; AVX-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; AVX-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 +; AVX-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 +; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 +; AVX-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] +; AVX-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 +; AVX-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; AVX-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 +; AVX-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 +; AVX-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 +; AVX-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] +; AVX-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 +; AVX-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; AVX-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 +; AVX-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 +; AVX-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 +; AVX-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 +; AVX-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] +; AVX-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 +; AVX-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; AVX-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 +; AVX-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 +; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 +; AVX-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]] ; AVX-NEXT: ret void +; +; AVX2P-LABEL: @store_i64( +; AVX2P-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; AVX2P-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX2P-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX2P-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 +; AVX2P-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer +; AVX2P-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP6]], [[SHUFFLE]] +; AVX2P-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], +; AVX2P-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; AVX2P-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], +; AVX2P-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], +; AVX2P-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> +; AVX2P-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX2P-NEXT: store <4 x i64> [[TMP13]], <4 x i64>* [[TMP14]], align 8, !tbaa [[TBAA5]] +; AVX2P-NEXT: ret void ; %4 = zext i32 %1 to i64 %5 = load i64, i64* %0, align 8, !tbaa !7 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -199,29 +199,33 @@ ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 -; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 -; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], +; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 +; AVX-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], +; AVX-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( @@ -402,6 +406,7 @@ ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 @@ -410,21 +415,24 @@ ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 -; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -199,29 +199,33 @@ ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 -; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 -; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], +; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 +; AVX-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], +; AVX-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( @@ -402,6 +406,7 @@ ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 @@ -410,21 +415,24 @@ ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 -; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -227,15 +227,37 @@ ; logic...or a wide reduction? define i1 @logical_and_icmp_clamp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; SSE-LABEL: @logical_and_icmp_clamp( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], +; SSE-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) +; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]] +; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -260,17 +282,40 @@ } define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: call void @use1(i1 [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; SSE-NEXT: call void @use1(i1 [[TMP2]]) +; SSE-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], +; SSE-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]]) +; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]] +; SSE-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: call void @use1(i1 [[C2]]) +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -296,21 +341,44 @@ } define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false -; CHECK-NEXT: call void @use1(i1 [[S2]]) -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX1]] +; SSE-LABEL: @logical_and_icmp_clamp_extra_use_select( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; SSE-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false +; SSE-NEXT: call void @use1(i1 [[S2]]) +; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]] +; SSE-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false +; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX1]] +; +; AVX-LABEL: @logical_and_icmp_clamp_extra_use_select( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: call void @use1(i1 [[S2]]) +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -386,38 +454,20 @@ } define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { -; SSE-LABEL: @logical_and_icmp_clamp_partial( -; SSE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], -; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 -; SSE-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[X]], -; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 -; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false -; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false -; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 [[OP_RDX1]], i1 false -; SSE-NEXT: ret i1 [[OP_RDX2]] -; -; AVX-LABEL: @logical_and_icmp_clamp_partial( -; AVX-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42 -; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42 -; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], -; AVX-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false -; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false -; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false -; AVX-NEXT: ret i1 [[OP_RDX2]] +; CHECK-LABEL: @logical_and_icmp_clamp_partial( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false +; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX2]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -442,17 +492,39 @@ } define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; SSE-LABEL: @logical_and_icmp_clamp_pred_diff( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], +; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]] +; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp_pred_diff( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll @@ -12,19 +12,20 @@ ; CHECK-LABEL: @rotate_with_external_users( ; CHECK-NEXT: bb1: ; CHECK-NEXT: [[LD:%.*]] = load double, double* undef, align 8 +; CHECK-NEXT: [[PTRA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], -; CHECK-NEXT: [[PTRA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PTRA1]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[TMP5]], 3.300000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[ADD4:%.*]] = fadd double [[TMP6]], 4.400000e+00 +; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[ADD3]], [[ADD4]] ; CHECK-NEXT: ret void ; bb1: @@ -55,24 +56,30 @@ ; CHECK-LABEL: @non_consecutive_external_users( ; CHECK-NEXT: bb1: ; CHECK-NEXT: [[LD:%.*]] = load double, double* undef, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], +; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[LD]], 1.100000e+00 +; CHECK-NEXT: [[ADD6:%.*]] = fadd double [[LD]], 2.200000e+00 +; CHECK-NEXT: [[ADD7:%.*]] = fadd double [[LD]], 3.300000e+00 +; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[LD]], 4.400000e+00 +; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[ADD5]], 1.100000e+00 +; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[ADD6]], 2.200000e+00 +; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[ADD7]], 3.300000e+00 +; CHECK-NEXT: [[ADD4:%.*]] = fadd double [[ADD8]], 4.400000e+00 +; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[ADD1]], 1.100000e+00 +; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[ADD2]], 2.200000e+00 +; CHECK-NEXT: [[MUL3:%.*]] = fmul double [[ADD3]], 3.300000e+00 +; CHECK-NEXT: [[MUL4:%.*]] = fmul double [[ADD4]], 4.400000e+00 ; CHECK-NEXT: [[PTRA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 ; CHECK-NEXT: [[PTRA4:%.*]] = getelementptr inbounds double, double* [[A]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 3 -; CHECK-NEXT: store double [[TMP4]], double* [[PTRA1]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 2 -; CHECK-NEXT: store double [[TMP5]], double* [[PTRA1]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1 -; CHECK-NEXT: store double [[TMP6]], double* [[PTRA4]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0 -; CHECK-NEXT: store double [[TMP7]], double* [[PTRA4]], align 8 +; CHECK-NEXT: store double [[MUL4]], double* [[PTRA1]], align 8 +; CHECK-NEXT: store double [[MUL3]], double* [[PTRA1]], align 8 +; CHECK-NEXT: store double [[MUL2]], double* [[PTRA4]], align 8 +; CHECK-NEXT: store double [[MUL1]], double* [[PTRA4]], align 8 ; CHECK-NEXT: br label [[SEED_LOOP:%.*]] ; CHECK: seed_loop: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x double> [ [[TMP3]], [[BB1:%.*]] ], [ zeroinitializer, [[SEED_LOOP]] ] +; CHECK-NEXT: [[PHI1:%.*]] = phi double [ [[MUL1]], [[BB1:%.*]] ], [ 0.000000e+00, [[SEED_LOOP]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi double [ [[MUL2]], [[BB1]] ], [ 0.000000e+00, [[SEED_LOOP]] ] +; CHECK-NEXT: [[PHI3:%.*]] = phi double [ [[MUL3]], [[BB1]] ], [ 0.000000e+00, [[SEED_LOOP]] ] +; CHECK-NEXT: [[PHI4:%.*]] = phi double [ [[MUL4]], [[BB1]] ], [ 0.000000e+00, [[SEED_LOOP]] ] ; CHECK-NEXT: br label [[SEED_LOOP]] ; bb1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -15,21 +15,21 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[SHUFFLE1]], -; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], +; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[CONV31_I]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[SHUFFLE1]], ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_8_I_I]], i32 9 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8> ; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]], ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -23,41 +23,40 @@ define float @foo(float* nocapture readonly %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP9]], -; CHECK-NEXT: [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00 +; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 +; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 +; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 ; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]] ; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] ; CHECK-NEXT: ret float [[ADD17]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -277,20 +277,19 @@ ; ENABLED-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 +; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 ; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 ; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1 -; ENABLED-NEXT: [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], +; ENABLED-NEXT: [[TMP0:%.*]] = fadd fast double [[A0]], 2.000000e+00 +; ENABLED-NEXT: [[TMP1:%.*]] = fadd fast double [[B1]], 2.000000e+00 ; ENABLED-NEXT: br label [[BB:%.*]] ; ENABLED: bb: ; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 ; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]] -; ENABLED-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; ENABLED-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 +; ENABLED-NEXT: [[SUM0:%.*]] = fadd fast double [[TMP0]], [[B0]] +; ENABLED-NEXT: [[SUM1:%.*]] = fadd fast double [[TMP1]], [[A1]] +; ENABLED-NEXT: store double [[SUM0]], double* [[IDXS0]], align 8 +; ENABLED-NEXT: store double [[SUM1]], double* [[IDXS1]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -189,13 +189,14 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 +; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -222,21 +223,22 @@ define void @addsub1(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -561,13 +563,14 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[INCDEC_PTR3]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP7]], align 4 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 +; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 +; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 +; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -594,21 +597,22 @@ define void @addsub1f(float* noalias %dst, float* noalias %src) { ; CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP6]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP7]], -3.000000e+00 +; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 ; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll --- a/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll @@ -25,10 +25,14 @@ ; CHECK-VF4-160-NEXT: ret void ; ; CHECK-VF2-160-LABEL: @foo( -; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast ([8 x i32]* @X to <2 x i32>*), align 1 -; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2) to <2 x i32>*), align 1 -; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <2 x i32>*), align 1 -; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6) to <2 x i32>*), align 1 +; CHECK-VF2-160-NEXT: store i32 1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 0), align 1 +; CHECK-VF2-160-NEXT: store i32 2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 1), align 1 +; CHECK-VF2-160-NEXT: store i32 3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2), align 1 +; CHECK-VF2-160-NEXT: store i32 4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 3), align 1 +; CHECK-VF2-160-NEXT: store i32 5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4), align 1 +; CHECK-VF2-160-NEXT: store i32 6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 5), align 1 +; CHECK-VF2-160-NEXT: store i32 7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6), align 1 +; CHECK-VF2-160-NEXT: store i32 8, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 7), align 1 ; CHECK-VF2-160-NEXT: ret void ; ; CHECK-VF8-128-LABEL: @foo( @@ -42,10 +46,14 @@ ; CHECK-VF4-128-NEXT: ret void ; ; CHECK-VF2-128-LABEL: @foo( -; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast ([8 x i32]* @X to <2 x i32>*), align 1 -; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2) to <2 x i32>*), align 1 -; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <2 x i32>*), align 1 -; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6) to <2 x i32>*), align 1 +; CHECK-VF2-128-NEXT: store i32 1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 0), align 1 +; CHECK-VF2-128-NEXT: store i32 2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 1), align 1 +; CHECK-VF2-128-NEXT: store i32 3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2), align 1 +; CHECK-VF2-128-NEXT: store i32 4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 3), align 1 +; CHECK-VF2-128-NEXT: store i32 5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4), align 1 +; CHECK-VF2-128-NEXT: store i32 6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 5), align 1 +; CHECK-VF2-128-NEXT: store i32 7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6), align 1 +; CHECK-VF2-128-NEXT: store i32 8, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 7), align 1 ; CHECK-VF2-128-NEXT: ret void ; ; CHECK-VF8-256-LABEL: @foo(