diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3858,7 +3858,7 @@ while (!OrderedEntries.empty()) { // 1. Filter out only reordered nodes. // 2. If the entry has multiple uses - skip it and jump to the next node. - MapVector>> Users; + DenseMap>> Users; SmallVector Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || @@ -3886,7 +3886,13 @@ // Erase filtered entries. for_each(Filtered, [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); - for (auto &Data : Users) { + SmallVector< + std::pair>>> + UsersVec(Users.begin(), Users.end()); + sort(UsersVec, [](const auto &Data1, const auto &Data2) { + return Data1.first->Idx > Data2.first->Idx; + }); + for (auto &Data : UsersVec) { // Check that operands are used only in the User node. SmallVector GatherOps; if (!canReorderOperands(Data.first, Data.second, NonVectorized, diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -7,65 +7,68 @@ define i16 @reduce_allstrided(i16* nocapture noundef readonly %x, i16* nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: @reduce_allstrided( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[MUL2:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[MUL2]] to i64 ; 
CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM3]] -; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[MUL5]] to i64 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]] -; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 ; CHECK-NEXT: [[MUL8:%.*]] = shl nsw i32 [[STRIDE]], 2 ; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[MUL8]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]] -; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 ; CHECK-NEXT: [[MUL11:%.*]] = mul nsw i32 [[STRIDE]], 5 ; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[MUL11]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]] -; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 ; CHECK-NEXT: [[MUL14:%.*]] = mul nsw i32 [[STRIDE]], 6 ; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[MUL14]] to i64 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM15]] -; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX16]], align 2 ; CHECK-NEXT: [[MUL17:%.*]] = mul nsw i32 [[STRIDE]], 7 ; CHECK-NEXT: [[IDXPROM18:%.*]] = sext i32 [[MUL17]] to i64 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM18]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM3]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds 
i16, i16* [[Y]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM18]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[X]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX16]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[Y]], align 2 ; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2 -; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM3]] ; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2 -; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]] ; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]] ; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX32]], align 2 -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX35]], align 2 -; CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX38]], align 2 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM18]] ; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX41]], align 2 -; CHECK-NEXT: [[MUL43:%.*]] = mul i16 [[TMP8]], [[TMP0]] -; CHECK-NEXT: 
[[MUL48:%.*]] = mul i16 [[TMP9]], [[TMP1]] -; CHECK-NEXT: [[ADD49:%.*]] = add i16 [[MUL48]], [[MUL43]] -; CHECK-NEXT: [[MUL54:%.*]] = mul i16 [[TMP10]], [[TMP2]] -; CHECK-NEXT: [[ADD55:%.*]] = add i16 [[ADD49]], [[MUL54]] -; CHECK-NEXT: [[MUL60:%.*]] = mul i16 [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[ADD61:%.*]] = add i16 [[ADD55]], [[MUL60]] -; CHECK-NEXT: [[MUL66:%.*]] = mul i16 [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[ADD67:%.*]] = add i16 [[ADD61]], [[MUL66]] -; CHECK-NEXT: [[MUL72:%.*]] = mul i16 [[TMP13]], [[TMP5]] -; CHECK-NEXT: [[ADD73:%.*]] = add i16 [[ADD67]], [[MUL72]] -; CHECK-NEXT: [[MUL78:%.*]] = mul i16 [[TMP14]], [[TMP6]] -; CHECK-NEXT: [[ADD79:%.*]] = add i16 [[ADD73]], [[MUL78]] -; CHECK-NEXT: [[MUL84:%.*]] = mul i16 [[TMP15]], [[TMP7]] -; CHECK-NEXT: [[ADD85:%.*]] = add i16 [[ADD79]], [[MUL84]] -; CHECK-NEXT: ret i16 [[ADD85]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> poison, i16 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP10]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP11]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP12]], i64 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP13]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x i16> [[TMP21]], i16 [[TMP14]], i64 6 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i16> [[TMP22]], i16 [[TMP15]], i64 7 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i16> [[TMP24]], i16 [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i16> [[TMP25]], i16 [[TMP2]], i64 2 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i16> [[TMP26]], i16 [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP27]], i16 [[TMP4]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = 
insertelement <8 x i16> [[TMP28]], i16 [[TMP5]], i64 5 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP29]], i16 [[TMP6]], i64 6 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i16> [[TMP30]], i16 [[TMP7]], i64 7 +; CHECK-NEXT: [[TMP32:%.*]] = mul <8 x i16> [[TMP23]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP32]]) +; CHECK-NEXT: ret i16 [[TMP33]] ; entry: %0 = load i16, i16* %x, align 2 @@ -132,63 +135,46 @@ define i16 @reduce_blockstrided2(i16* nocapture noundef readonly %x, i16* nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: @reduce_blockstrided2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[X:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[STRIDE]], 1 -; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[ADD3]] to i64 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM4]] -; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM7:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM7]] -; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX8]], align 2 -; CHECK-NEXT: [[ADD10:%.*]] = or i32 [[MUL]], 1 -; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[ADD10]] to i64 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* 
[[ARRAYIDX12]], align 2 ; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[MUL13]] to i64 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM15]] -; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX16]], align 2 -; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL13]], 1 -; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM19]] -; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX24]], align 2 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM7]] -; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM15]] -; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX32]], align 2 -; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX33]], align 2 -; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM4]] -; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX36]], align 2 -; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX40]], align 2 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM19]] -; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX44]], align 2 -; CHECK-NEXT: [[MUL46:%.*]] = mul i16 [[TMP8]], [[TMP0]] -; CHECK-NEXT: [[MUL52:%.*]] = mul i16 [[TMP12]], [[TMP1]] -; 
CHECK-NEXT: [[MUL58:%.*]] = mul i16 [[TMP9]], [[TMP2]] -; CHECK-NEXT: [[MUL64:%.*]] = mul i16 [[TMP13]], [[TMP3]] -; CHECK-NEXT: [[MUL70:%.*]] = mul i16 [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[MUL76:%.*]] = mul i16 [[TMP14]], [[TMP5]] -; CHECK-NEXT: [[MUL82:%.*]] = mul i16 [[TMP11]], [[TMP6]] -; CHECK-NEXT: [[MUL88:%.*]] = mul i16 [[TMP15]], [[TMP7]] -; CHECK-NEXT: [[ADD53:%.*]] = add i16 [[MUL58]], [[MUL46]] -; CHECK-NEXT: [[ADD59:%.*]] = add i16 [[ADD53]], [[MUL70]] -; CHECK-NEXT: [[ADD65:%.*]] = add i16 [[ADD59]], [[MUL82]] -; CHECK-NEXT: [[ADD71:%.*]] = add i16 [[ADD65]], [[MUL52]] -; CHECK-NEXT: [[ADD77:%.*]] = add i16 [[ADD71]], [[MUL64]] -; CHECK-NEXT: [[ADD83:%.*]] = add i16 [[ADD77]], [[MUL76]] -; CHECK-NEXT: [[ADD89:%.*]] = add i16 [[ADD83]], [[MUL88]] -; CHECK-NEXT: ret i16 [[ADD89]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <2 x i16>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX2]] to <2 x i16>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX8]] to <2 x i16>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX16]] to <2 x i16>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[Y]] to <2 x i16>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i16>, <2 x i16>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX24]] to <2 x i16>* +; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i16>, <2 x i16>* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16* [[ARRAYIDX28]] to <2 x i16>* +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i16>, <2 x i16>* [[TMP12]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[ARRAYIDX32]] to <2 x i16>* +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i16>, <2 x i16>* [[TMP14]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = 
shufflevector <2 x i16> [[TMP9]], <2 x i16> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP16]], <8 x i16> [[TMP17]], <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i16> [[TMP15]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP18]], <8 x i16> [[TMP19]], <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x i16> [[TMP21]], <8 x i16> [[TMP22]], <8 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x i16> [[TMP23]], <8 x i16> [[TMP24]], <8 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i16> [[TMP20]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP26]]) +; CHECK-NEXT: ret i16 [[TMP27]] ; entry: %0 = load i16, i16* %x, align 2 @@ -253,45 +239,44 @@ define i16 @reduce_blockstrided3(i16* nocapture noundef readonly %x, i16* nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: @reduce_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L0:%.*]] = load i16, i16* [[X:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 1 -; CHECK-NEXT: [[L1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 2 -; CHECK-NEXT: [[L2:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM]] ; CHECK-NEXT: [[L4:%.*]] = load i16, i16* 
[[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]] -; CHECK-NEXT: [[L5:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 ; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2 ; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]] ; CHECK-NEXT: [[L6:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-NEXT: [[L8:%.*]] = load i16, i16* [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 1 -; CHECK-NEXT: [[L9:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2 -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 2 -; CHECK-NEXT: [[L10:%.*]] = load i16, i16* [[ARRAYIDX16]], align 2 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM]] ; CHECK-NEXT: [[L12:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]] -; CHECK-NEXT: [[L13:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2 ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]] ; CHECK-NEXT: [[L14:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2 -; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[L8]], [[L0]] -; CHECK-NEXT: [[MUL36:%.*]] = mul i16 [[L9]], [[L1]] -; CHECK-NEXT: [[ADD37:%.*]] = add i16 [[MUL36]], [[MUL]] -; CHECK-NEXT: [[MUL48:%.*]] = mul i16 [[L10]], [[L2]] -; CHECK-NEXT: [[ADD49:%.*]] = add i16 [[ADD37]], [[MUL48]] -; CHECK-NEXT: [[MUL54:%.*]] = mul i16 [[L13]], [[L5]] -; CHECK-NEXT: [[ADD55:%.*]] = add i16 [[ADD49]], [[MUL54]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <2 x i16>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x 
i16>* [[TMP0]], align 2 +; CHECK-NEXT: [[L2:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[L5:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[Y]] to <2 x i16>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* [[TMP2]], align 2 +; CHECK-NEXT: [[L10:%.*]] = load i16, i16* [[ARRAYIDX16]], align 2 +; CHECK-NEXT: [[L13:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[L10]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[L13]], i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i16> [[TMP7]], i16 [[L2]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> [[TMP8]], i16 [[L5]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i16> [[TMP6]], [[TMP9]] ; CHECK-NEXT: [[MUL60:%.*]] = mul i16 [[L12]], [[L4]] -; CHECK-NEXT: [[ADD61:%.*]] = add i16 [[ADD55]], [[MUL60]] ; CHECK-NEXT: [[MUL72:%.*]] = mul i16 [[L14]], [[L6]] -; CHECK-NEXT: [[ADD73:%.*]] = add i16 [[ADD61]], [[MUL72]] -; CHECK-NEXT: ret i16 [[ADD73]] +; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[MUL60]], [[MUL72]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i16 [[TMP11]], [[OP_RDX]] +; CHECK-NEXT: ret i16 [[OP_RDX1]] ; entry: %l0 = load i16, i16* %x, align 2 @@ -689,70 +674,96 @@ define void @store_blockstrided3(i32* nocapture noundef readonly %x, i32* nocapture noundef readonly %y, i32* nocapture noundef writeonly %z, i32 noundef %stride) { ; CHECK-LABEL: @store_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, 
i32* [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[STRIDE]], 2 +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[ADD7]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM8]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ADD14:%.*]] = or i32 [[MUL]], 1 +; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4 ; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL]], 2 ; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM19]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM23]] -; CHECK-NEXT: 
[[TMP2:%.*]] = load i32, i32* [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX24]], align 4 ; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 ; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 ; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM27]] -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2 +; CHECK-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM31]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM8]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX52]], align 4 ; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM19]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* 
[[ARRAYIDX56]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX56]], align 4 ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX60]], align 4 ; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM27]] -; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, i32* [[Z:%.*]], i64 1 -; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM31]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX68]], align 4 +; CHECK-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP11]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX70:%.*]] = getelementptr inbounds i32, i32* [[Z:%.*]], i64 2 +; CHECK-NEXT: store i32 [[MUL69]], i32* [[ARRAYIDX70]], align 4 +; CHECK-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP12]], [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 1 +; CHECK-NEXT: store i32 [[MUL71]], i32* [[ARRAYIDX72]], align 4 +; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP13]], [[TMP2]] +; CHECK-NEXT: store i32 [[MUL73]], i32* [[Z]], align 4 ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 6 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[X]] to <2 x i32>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX6]] to <2 x i32>* -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[Y]] to <2 x i32>* -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX41]] to <2 x i32>* -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4 
-; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]] -; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>* +; CHECK-NEXT: [[MUL77:%.*]] = mul nsw i32 [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX78:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 4 +; CHECK-NEXT: store i32 [[MUL77]], i32* [[ARRAYIDX78]], align 4 +; CHECK-NEXT: [[MUL79:%.*]] = mul nsw i32 [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 3 +; CHECK-NEXT: store i32 [[MUL79]], i32* [[ARRAYIDX80]], align 4 +; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP16]], [[TMP5]] +; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 8 +; CHECK-NEXT: store i32 [[MUL81]], i32* [[ARRAYIDX82]], align 4 +; CHECK-NEXT: [[MUL83:%.*]] = mul nsw i32 [[TMP17]], [[TMP6]] ; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 7 -; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] -; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 11 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX12]] to <2 x i32>* -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[ARRAYIDX28]] to <2 x i32>* -; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i32>, <2 x i32>* [[TMP19]], align 4 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX48]] to <2 x i32>* -; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX64]] to <2 x i32>* -; CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, <2 x i32>* [[TMP23]], align 4 -; CHECK-NEXT: store i32 [[MUL73]], i32* [[Z]], align 4 -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x 
i32>* [[TMP16]], align 4 +; CHECK-NEXT: store i32 [[MUL83]], i32* [[ARRAYIDX84]], align 4 +; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP18]], [[TMP7]] ; CHECK-NEXT: store i32 [[MUL85]], i32* [[ARRAYIDX76]], align 4 +; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP19]], [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 11 ; CHECK-NEXT: store i32 [[MUL87]], i32* [[ARRAYIDX88]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP22]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]] -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[ARRAYIDX84]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP27]], align 4 +; CHECK-NEXT: [[MUL89:%.*]] = mul nsw i32 [[TMP20]], [[TMP9]] +; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 10 +; CHECK-NEXT: store i32 [[MUL89]], i32* [[ARRAYIDX90]], align 4 +; CHECK-NEXT: [[MUL91:%.*]] = mul nsw i32 [[TMP21]], [[TMP10]] +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 9 +; CHECK-NEXT: store i32 [[MUL91]], i32* [[ARRAYIDX92]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -852,22 +863,66 @@ define void @store_blockstrided4(i16* nocapture noundef readonly %x, i16* nocapture noundef readonly %y, i32 noundef %stride, i16 *%dst0) { ; CHECK-LABEL: @store_blockstrided4( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[X:%.*]], align 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 
; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP10]], align 2 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1 +; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]] +; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2 +; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]] +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext 
i32 [[ADD11]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[Y:%.*]], align 2 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX16]], align 2 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 3 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX17]], align 2 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]] +; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]] +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2 +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2 +; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[TMP8]], [[TMP0]] +; CHECK-NEXT: [[MUL36:%.*]] = mul i16 [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[MUL42:%.*]] = mul i16 [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[MUL48:%.*]] = mul i16 [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[MUL54:%.*]] = mul i16 [[TMP13]], [[TMP5]] +; CHECK-NEXT: [[MUL60:%.*]] = mul i16 [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[MUL66:%.*]] = mul i16 [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[MUL72:%.*]] = mul i16 [[TMP14]], [[TMP6]] +; CHECK-NEXT: [[DST1:%.*]] = getelementptr inbounds i16, i16* [[DST0:%.*]], i64 1 +; CHECK-NEXT: [[DST2:%.*]] = getelementptr inbounds i16, i16* [[DST0]], i64 2 +; CHECK-NEXT: 
[[DST3:%.*]] = getelementptr inbounds i16, i16* [[DST0]], i64 3 +; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i16, i16* [[DST0]], i64 4 +; CHECK-NEXT: [[DST5:%.*]] = getelementptr inbounds i16, i16* [[DST0]], i64 5 +; CHECK-NEXT: [[DST6:%.*]] = getelementptr inbounds i16, i16* [[DST0]], i64 6 +; CHECK-NEXT: [[DST7:%.*]] = getelementptr inbounds i16, i16* [[DST0]], i64 7 +; CHECK-NEXT: store i16 [[MUL]], i16* [[DST0]], align 2 +; CHECK-NEXT: store i16 [[MUL36]], i16* [[DST1]], align 2 +; CHECK-NEXT: store i16 [[MUL42]], i16* [[DST2]], align 2 +; CHECK-NEXT: store i16 [[MUL48]], i16* [[DST3]], align 2 +; CHECK-NEXT: store i16 [[MUL54]], i16* [[DST4]], align 2 +; CHECK-NEXT: store i16 [[MUL60]], i16* [[DST5]], align 2 +; CHECK-NEXT: store i16 [[MUL66]], i16* [[DST6]], align 2 +; CHECK-NEXT: store i16 [[MUL72]], i16* [[DST7]], align 2 ; CHECK-NEXT: ret void ; entry: @@ -939,51 +994,147 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P1:%.*]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[P2:%.*]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX8]], align 
1 +; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[TMP5]] to i32 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 5 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX13]], align 1 +; CHECK-NEXT: [[CONV14:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1 +; CHECK-NEXT: [[CONV16:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[CONV21:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1 +; CHECK-NEXT: [[CONV23:%.*]] = zext i8 [[TMP9]] to i32 +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX25]], align 1 +; CHECK-NEXT: [[CONV26:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1 +; CHECK-NEXT: [[CONV28:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1 +; CHECK-NEXT: [[CONV35:%.*]] = zext i8 [[TMP13]] to i32 +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 7 +; CHECK-NEXT: 
[[TMP14:%.*]] = load i8, i8* [[ARRAYIDX37]], align 1 +; CHECK-NEXT: [[CONV38:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1 +; CHECK-NEXT: [[CONV40:%.*]] = zext i8 [[TMP15]] to i32 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32 ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[CONV2_1:%.*]] = zext i8 [[TMP17]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[CONV4_1:%.*]] = zext i8 [[TMP18]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, i32* [[DST0:%.*]], i64 4 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[CONV6_1:%.*]] = zext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[CONV9_1:%.*]] = zext i8 [[TMP20]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1 +; CHECK-NEXT: [[CONV11_1:%.*]] = zext i8 [[TMP21]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[ARRAYIDX13_1]], align 1 +; CHECK-NEXT: [[CONV14_1:%.*]] = zext i8 [[TMP22]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5 +; 
CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1 +; CHECK-NEXT: [[CONV16_1:%.*]] = zext i8 [[TMP23]] to i32 +; CHECK-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[ARRAYIDX20_1]], align 1 +; CHECK-NEXT: [[CONV21_1:%.*]] = zext i8 [[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1 +; CHECK-NEXT: [[CONV23_1:%.*]] = zext i8 [[TMP25]] to i32 +; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[ARRAYIDX25_1]], align 1 +; CHECK-NEXT: [[CONV26_1:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1 +; CHECK-NEXT: [[CONV28_1:%.*]] = zext i8 [[TMP27]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP28]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1 +; CHECK-NEXT: [[CONV35_1:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX37_1]], align 1 +; CHECK-NEXT: [[CONV38_1:%.*]] = zext i8 [[TMP30]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1 +; CHECK-NEXT: [[CONV40_1:%.*]] = zext i8 [[TMP31]] to i32 +; CHECK-NEXT: [[M1:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV4]] +; CHECK-NEXT: [[M2:%.*]] = mul nuw nsw i32 [[CONV9]], [[CONV14]] 
+; CHECK-NEXT: [[M3:%.*]] = mul nuw nsw i32 [[CONV21]], [[CONV26]] +; CHECK-NEXT: [[M4:%.*]] = mul nuw nsw i32 [[CONV33]], [[CONV38]] +; CHECK-NEXT: [[M5:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV6]] +; CHECK-NEXT: [[M6:%.*]] = mul nuw nsw i32 [[CONV11]], [[CONV16]] +; CHECK-NEXT: [[M7:%.*]] = mul nuw nsw i32 [[CONV23]], [[CONV28]] +; CHECK-NEXT: [[M8:%.*]] = mul nuw nsw i32 [[CONV35]], [[CONV40]] +; CHECK-NEXT: [[M9:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV4_1]] +; CHECK-NEXT: [[M10:%.*]] = mul nuw nsw i32 [[CONV9_1]], [[CONV14_1]] +; CHECK-NEXT: [[M11:%.*]] = mul nuw nsw i32 [[CONV21_1]], [[CONV26_1]] +; CHECK-NEXT: [[M12:%.*]] = mul nuw nsw i32 [[CONV33_1]], [[CONV38_1]] +; CHECK-NEXT: [[M13:%.*]] = mul nuw nsw i32 [[CONV2_1]], [[CONV6_1]] +; CHECK-NEXT: [[M14:%.*]] = mul nuw nsw i32 [[CONV11_1]], [[CONV16_1]] +; CHECK-NEXT: [[M15:%.*]] = mul nuw nsw i32 [[CONV23_1]], [[CONV28_1]] +; CHECK-NEXT: [[M16:%.*]] = mul nuw nsw i32 [[CONV35_1]], [[CONV40_1]] +; CHECK-NEXT: [[DST1:%.*]] = getelementptr inbounds i32, i32* [[DST0:%.*]], i64 1 +; CHECK-NEXT: [[DST2:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 2 +; CHECK-NEXT: [[DST3:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 3 +; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 4 +; CHECK-NEXT: [[DST5:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 5 +; CHECK-NEXT: [[DST6:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 6 +; CHECK-NEXT: [[DST7:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 7 ; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 8 +; CHECK-NEXT: [[DST9:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 9 +; CHECK-NEXT: [[DST10:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 10 +; CHECK-NEXT: [[DST11:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 11 ; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 12 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; 
CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[DST0]] to <4 x i32>* -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP10]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[DST4]] to <4 x i32>* -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP18]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[DST8]] to <4 x i32>* -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP28:%.*]] = load <4 x i8>, <4 x i8>* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = zext 
<4 x i8> [[TMP28]] to <4 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw <4 x i32> [[TMP26]], [[TMP29]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[DST12]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP31]], align 4 +; CHECK-NEXT: [[DST13:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 13 +; CHECK-NEXT: [[DST14:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 14 +; CHECK-NEXT: [[DST15:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 15 +; CHECK-NEXT: store i32 [[M1]], i32* [[DST0]], align 4 +; CHECK-NEXT: store i32 [[M2]], i32* [[DST1]], align 4 +; CHECK-NEXT: store i32 [[M3]], i32* [[DST2]], align 4 +; CHECK-NEXT: store i32 [[M4]], i32* [[DST3]], align 4 +; CHECK-NEXT: store i32 [[M5]], i32* [[DST4]], align 4 +; CHECK-NEXT: store i32 [[M6]], i32* [[DST5]], align 4 +; CHECK-NEXT: store i32 [[M7]], i32* [[DST6]], align 4 +; CHECK-NEXT: store i32 [[M8]], i32* [[DST7]], align 4 +; CHECK-NEXT: store i32 [[M9]], i32* [[DST8]], align 4 +; CHECK-NEXT: store i32 [[M10]], i32* [[DST9]], align 4 +; CHECK-NEXT: store i32 [[M11]], i32* [[DST10]], align 4 +; CHECK-NEXT: store i32 [[M12]], i32* [[DST11]], align 4 +; CHECK-NEXT: store i32 [[M13]], i32* [[DST12]], align 4 +; CHECK-NEXT: store i32 [[M14]], i32* [[DST13]], align 4 +; CHECK-NEXT: store i32 [[M15]], i32* [[DST14]], align 4 +; CHECK-NEXT: store i32 [[M16]], i32* [[DST15]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-grtev4-linux-gnu -S | FileCheck %s + +; This checks that reorderBottomToTop() can handle reordering of a TreeEntry +; which has a user TreeEntry that has already been reordered. +; Here is how the crash occurs: +; +; (N4)OrderB +; | +; (N1)OrderA (N2)OrderA (N3)NoOrder +; \ | / +; (Phi)NoOrder +; +; 1. Phi is visited along with its operands (N1,N2,N3). BestOrder is "OrderA". +; 2. Phi along with all its operands (N1,N2,N3) are reordered. The result is: +; +; (N4)OrderB +; | +; (N1)NoOrder (N2)NoOrder (N3)OrderA +; \ | / +; (Phi)OrderA +; +; 3. N3 is now visited along with its operand N4. BestOrder is "OrderB". +; 4. N3 and N4 are reordered. The result is this: +; +; (N4)NoOrder +; | +; (N1)NoOrder (N2)NoOrder (N3)OrderB +; \ | / +; (Phi)OrderA +; +; At this point there is a discrepancy between Phi's Operand 2 which is +; reordered based on OrderA and N3's OrderB. This results in a crash in +; vectorizeTree() on its way from N3 back to the Phi. The reason is that +; N3->isSame(Phi's operand 2) returns false and vectorizeTree() skips N3. +; +; This patch changes the order in which the nodes are visited to bottom-up, +; which fixes the issue. +; +; NOTE: The crash shows up when reorderTopToBottom() does not reorder the tree, +; so to simulate this we add external store users. Alternatively one can +; comment out reorderTopToBottom() and remove the stores.
+ + +define void @reorder_crash(float* %ptr) { +; CHECK-LABEL: @reorder_crash( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0 +; CHECK-NEXT: br i1 undef, label [[BB0:%.*]], label [[BB12:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb12: +; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP1]], [[BB0]] ], [ [[TMP4]], [[BB1]] ], [ [[SHUFFLE]], [[BB2]] ] +; CHECK-NEXT: ret void +; +entry: + %gep0 = getelementptr inbounds float, float* %ptr, i64 0 + %gep1 = getelementptr inbounds float, float* %ptr, i64 1 + %gep2 = getelementptr inbounds float, float* %ptr, i64 2 + %gep3 = getelementptr inbounds float, float* %ptr, i64 3 + br i1 undef, label %bb0, label %bb12 + +bb0: + ; Used by phi in this order: 1, 0, 2, 3 + %ld00 = load float, float* %gep0 + %ld01 = load float, float* %gep1 + %ld02 = load float, float* %gep2 + %ld03 = load 
float, float* %gep3 + + ; External store users in natural order 0, 1, 2, 3 + store float %ld00, float *%gep0 + store float %ld01, float *%gep1 + store float %ld02, float *%gep2 + store float %ld03, float *%gep3 + br label %bb3 + +bb12: + br i1 undef, label %bb1, label %bb2 + +bb1: + ; Used by phi in this order: 1, 0, 2, 3 + %ld10 = load float, float* %gep0 + %ld11 = load float, float* %gep1 + %ld12 = load float, float* %gep2 + %ld13 = load float, float* %gep3 + + ; External store users in natural order 0, 1, 2, 3 + store float %ld10, float *%gep0 + store float %ld11, float *%gep1 + store float %ld12, float *%gep2 + store float %ld13, float *%gep3 + + br label %bb3 + +bb2: + ; Used by fadd in this order: 2, 3, 0, 1 + %ld20 = load float, float* %gep0 + %ld21 = load float, float* %gep1 + %ld22 = load float, float* %gep2 + %ld23 = load float, float* %gep3 + + ; Used by phi in this order: 0, 1, 2, 3 + %add20 = fadd float %ld22, 0.0 + %add21 = fadd float %ld23, 0.0 + %add22 = fadd float %ld20, 0.0 + %add23 = fadd float %ld21, 0.0 + br label %bb3 + +bb3: + %phi0 = phi float [ %ld01, %bb0 ], [ %ld11, %bb1 ], [ %add20, %bb2 ] + %phi1 = phi float [ %ld00, %bb0 ], [ %ld10, %bb1 ], [ %add21, %bb2 ] + %phi2 = phi float [ %ld02, %bb0 ], [ %ld12, %bb1 ], [ %add22, %bb2 ] + %phi3 = phi float [ %ld03, %bb0 ], [ %ld13, %bb1 ], [ %add23, %bb2 ] + ret void +}