Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6821,7 +6821,7 @@ const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) + if (TreeEntry *E = getTreeEntry(S.OpValue)) { if (E->isSame(VL)) { Value *V = vectorizeTree(E); if (VF != cast(V->getType())->getNumElements()) { @@ -6871,7 +6871,40 @@ } } return V; + } else { + // If the values are not the same but a re-ordering of the existing + // node, create a shuffle for the new value. + auto IsReordering = [](TreeEntry *E, ArrayRef VL, + SmallVector &Reordering) { + for (Value *V : VL) { + auto It = find(E->Scalars, V); + if (It == E->Scalars.end()) + return false; + Reordering.push_back(E->findLaneForValue(V)); + } + return true; + }; + SmallVector Reordering; + if (IsReordering(E, VL, Reordering)) { + Value *V = vectorizeTree(E); + Value *V2 = PoisonValue::get(V->getType()); + // If the vectorized value is itself a shuffle, look through it to the + // underlying values. + if (auto *SV = dyn_cast(V)) { + for (int &E : Reordering) + E = E < 0 ? UndefMaskElem : SV->getMaskValue(E); + V = SV->getOperand(0); + V2 = SV->getOperand(1); + } + V = Builder.CreateShuffleVector(V, V2, Reordering, "reorder"); + if (auto *I = dyn_cast(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return V; + } } + } } // Can't vectorize this, so simply build a new vector with each lane Index: llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1645,154 +1645,30 @@ ; CHECK-NEXT: [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP54]], [[TMP64]] ; CHECK-NEXT: [[TMP66:%.*]] = shl nsw <16 x i32> [[TMP65]], ; CHECK-NEXT: [[TMP67:%.*]] = add nsw <16 x i32> [[TMP66]], [[TMP44]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x i32> [[TMP67]], i32 1 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <16 x i32> poison, i32 [[TMP68]], i32 0 -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i32> [[TMP67]], i32 0 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x i32> [[TMP69]], i32 [[TMP70]], i32 1 -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 x i32> [[TMP67]], i32 3 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <16 x i32> [[TMP71]], i32 [[TMP72]], i32 2 -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <16 x i32> [[TMP67]], i32 2 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i32> [[TMP73]], i32 [[TMP74]], i32 3 -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x i32> [[TMP67]], i32 5 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <16 x i32> [[TMP75]], i32 [[TMP76]], i32 4 -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <16 x i32> [[TMP67]], i32 4 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <16 x i32> [[TMP77]], i32 [[TMP78]], i32 5 -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x i32> [[TMP67]], i32 7 -; CHECK-NEXT: [[TMP81:%.*]] = insertelement <16 x i32> [[TMP79]], i32 [[TMP80]], i32 6 -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i32> [[TMP67]], i32 6 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <16 x i32> [[TMP81]], i32 [[TMP82]], i32 7 -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <16 x i32> [[TMP67]], i32 9 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <16 x i32> [[TMP83]], i32 [[TMP84]], i32 8 -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <16 x i32> [[TMP67]], i32 8 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <16 x i32> [[TMP85]], i32 [[TMP86]], i32 9 -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x i32> [[TMP67]], i32 11 -; CHECK-NEXT: [[TMP89:%.*]] = insertelement <16 x i32> [[TMP87]], i32 [[TMP88]], i32 10 -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <16 x i32> [[TMP67]], i32 10 -; CHECK-NEXT: [[TMP91:%.*]] = insertelement <16 x i32> [[TMP89]], i32 [[TMP90]], i32 11 -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i32> [[TMP67]], i32 13 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <16 x i32> [[TMP91]], i32 [[TMP92]], i32 12 -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <16 x i32> [[TMP67]], i32 12 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i32> [[TMP93]], i32 [[TMP94]], i32 13 -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <16 x i32> [[TMP67]], i32 15 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <16 x i32> [[TMP95]], i32 [[TMP96]], i32 14 -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i32> [[TMP67]], i32 14 -; CHECK-NEXT: [[TMP99:%.*]] = insertelement <16 x i32> [[TMP97]], i32 [[TMP98]], i32 15 -; CHECK-NEXT: [[TMP100:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP99]] -; CHECK-NEXT: [[TMP101:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP99]] -; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <16 x i32> [[TMP102]], i32 2 -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <16 x i32> poison, i32 [[TMP103]], i32 0 -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i32> [[TMP102]], i32 3 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <16 x i32> [[TMP104]], i32 [[TMP105]], i32 1 -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <16 x i32> [[TMP102]], i32 0 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <16 x i32> [[TMP106]], i32 [[TMP107]], i32 2 -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i32> [[TMP102]], i32 1 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <16 x i32> [[TMP108]], i32 [[TMP109]], i32 3 -; CHECK-NEXT: [[TMP111:%.*]] = extractelement <16 x i32> [[TMP102]], i32 10 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <16 x i32> [[TMP110]], i32 [[TMP111]], i32 4 -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i32> [[TMP102]], i32 11 -; CHECK-NEXT: [[TMP114:%.*]] = insertelement <16 x i32> [[TMP112]], i32 [[TMP113]], i32 5 -; CHECK-NEXT: [[TMP115:%.*]] = extractelement <16 x i32> [[TMP102]], i32 12 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <16 x i32> [[TMP114]], i32 [[TMP115]], i32 6 -; CHECK-NEXT: [[TMP117:%.*]] = extractelement <16 x i32> [[TMP102]], i32 13 -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <16 x i32> [[TMP116]], i32 [[TMP117]], i32 7 -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <16 x i32> [[TMP102]], i32 14 -; CHECK-NEXT: [[TMP120:%.*]] = insertelement <16 x i32> [[TMP118]], i32 [[TMP119]], i32 8 -; CHECK-NEXT: [[TMP121:%.*]] = extractelement <16 x i32> [[TMP102]], i32 15 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <16 x i32> [[TMP120]], i32 [[TMP121]], i32 9 -; CHECK-NEXT: [[TMP123:%.*]] = extractelement <16 x i32> [[TMP102]], i32 4 -; CHECK-NEXT: [[TMP124:%.*]] = insertelement <16 x i32> [[TMP122]], i32 [[TMP123]], i32 10 -; CHECK-NEXT: [[TMP125:%.*]] = extractelement <16 x i32> [[TMP102]], i32 5 -; CHECK-NEXT: [[TMP126:%.*]] = insertelement <16 x i32> [[TMP124]], i32 [[TMP125]], i32 11 -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <16 x i32> [[TMP102]], i32 6 -; CHECK-NEXT: [[TMP128:%.*]] = insertelement <16 x i32> [[TMP126]], i32 [[TMP127]], i32 12 -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <16 x i32> [[TMP102]], i32 7 -; CHECK-NEXT: [[TMP130:%.*]] = insertelement <16 x i32> [[TMP128]], i32 [[TMP129]], i32 13 -; CHECK-NEXT: [[TMP131:%.*]] = extractelement <16 x i32> [[TMP102]], i32 8 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <16 x i32> [[TMP130]], i32 [[TMP131]], i32 14 -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <16 x i32> [[TMP102]], i32 9 -; CHECK-NEXT: [[TMP134:%.*]] = insertelement <16 x i32> [[TMP132]], i32 [[TMP133]], i32 15 -; CHECK-NEXT: [[TMP135:%.*]] = sub nsw <16 x i32> [[TMP102]], [[TMP134]] -; CHECK-NEXT: [[TMP136:%.*]] = add nsw <16 x i32> [[TMP102]], [[TMP134]] -; CHECK-NEXT: [[TMP137:%.*]] = shufflevector <16 x i32> [[TMP135]], <16 x i32> [[TMP136]], <16 x i32> -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <16 x i32> [[TMP137]], i32 13 -; CHECK-NEXT: [[TMP139:%.*]] = insertelement <16 x i32> poison, i32 [[TMP138]], i32 0 -; CHECK-NEXT: [[TMP140:%.*]] = extractelement <16 x i32> [[TMP137]], i32 10 -; CHECK-NEXT: [[TMP141:%.*]] = insertelement <16 x i32> [[TMP139]], i32 [[TMP140]], i32 1 -; CHECK-NEXT: [[TMP142:%.*]] = extractelement <16 x i32> [[TMP137]], i32 7 -; CHECK-NEXT: [[TMP143:%.*]] = insertelement <16 x i32> [[TMP141]], i32 [[TMP142]], i32 2 -; CHECK-NEXT: [[TMP144:%.*]] = extractelement <16 x i32> [[TMP137]], i32 4 -; CHECK-NEXT: [[TMP145:%.*]] = insertelement <16 x i32> [[TMP143]], i32 [[TMP144]], i32 3 -; CHECK-NEXT: [[TMP146:%.*]] = extractelement <16 x i32> [[TMP137]], i32 3 -; CHECK-NEXT: [[TMP147:%.*]] = insertelement <16 x i32> [[TMP145]], i32 [[TMP146]], i32 4 -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <16 x i32> [[TMP137]], i32 6 -; CHECK-NEXT: [[TMP149:%.*]] = insertelement <16 x i32> [[TMP147]], i32 [[TMP148]], i32 5 -; CHECK-NEXT: [[TMP150:%.*]] = extractelement <16 x i32> [[TMP137]], i32 5 -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <16 x i32> [[TMP149]], i32 [[TMP150]], i32 6 -; CHECK-NEXT: [[TMP152:%.*]] = extractelement <16 x i32> [[TMP137]], i32 2 -; CHECK-NEXT: [[TMP153:%.*]] = insertelement <16 x i32> [[TMP151]], i32 [[TMP152]], i32 7 -; CHECK-NEXT: [[TMP154:%.*]] = extractelement <16 x i32> [[TMP137]], i32 9 -; CHECK-NEXT: [[TMP155:%.*]] = insertelement <16 x i32> [[TMP153]], i32 [[TMP154]], i32 8 -; CHECK-NEXT: [[TMP156:%.*]] = extractelement <16 x i32> [[TMP137]], i32 8 -; CHECK-NEXT: [[TMP157:%.*]] = insertelement <16 x i32> [[TMP155]], i32 [[TMP156]], i32 9 -; CHECK-NEXT: [[TMP158:%.*]] = extractelement <16 x i32> [[TMP137]], i32 1 -; CHECK-NEXT: [[TMP159:%.*]] = insertelement <16 x i32> [[TMP157]], i32 [[TMP158]], i32 10 -; CHECK-NEXT: [[TMP160:%.*]] = extractelement <16 x i32> [[TMP137]], i32 12 -; CHECK-NEXT: [[TMP161:%.*]] = insertelement <16 x i32> [[TMP159]], i32 [[TMP160]], i32 11 -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <16 x i32> [[TMP137]], i32 11 -; CHECK-NEXT: [[TMP163:%.*]] = insertelement <16 x i32> [[TMP161]], i32 [[TMP162]], i32 12 -; CHECK-NEXT: [[TMP164:%.*]] = extractelement <16 x i32> [[TMP137]], i32 0 -; CHECK-NEXT: [[TMP165:%.*]] = insertelement <16 x i32> [[TMP163]], i32 [[TMP164]], i32 13 -; CHECK-NEXT: [[TMP166:%.*]] = extractelement <16 x i32> [[TMP137]], i32 15 -; CHECK-NEXT: [[TMP167:%.*]] = insertelement <16 x i32> [[TMP165]], i32 [[TMP166]], i32 14 -; CHECK-NEXT: [[TMP168:%.*]] = extractelement <16 x i32> [[TMP137]], i32 14 -; CHECK-NEXT: [[TMP169:%.*]] = insertelement <16 x i32> [[TMP167]], i32 [[TMP168]], i32 15 -; CHECK-NEXT: [[TMP170:%.*]] = sub nsw <16 x i32> [[TMP137]], [[TMP169]] -; CHECK-NEXT: [[TMP171:%.*]] = add nsw <16 x i32> [[TMP137]], [[TMP169]] -; CHECK-NEXT: [[TMP172:%.*]] = shufflevector <16 x i32> [[TMP170]], <16 x i32> [[TMP171]], <16 x i32> -; CHECK-NEXT: [[TMP173:%.*]] = extractelement <16 x i32> [[TMP172]], i32 15 -; CHECK-NEXT: [[TMP174:%.*]] = insertelement <16 x i32> poison, i32 [[TMP173]], i32 0 -; CHECK-NEXT: [[TMP175:%.*]] = extractelement <16 x i32> [[TMP172]], i32 12 -; CHECK-NEXT: [[TMP176:%.*]] = insertelement <16 x i32> [[TMP174]], i32 [[TMP175]], i32 1 -; CHECK-NEXT: [[TMP177:%.*]] = extractelement <16 x i32> [[TMP172]], i32 9 -; CHECK-NEXT: [[TMP178:%.*]] = insertelement <16 x i32> [[TMP176]], i32 [[TMP177]], i32 2 -; CHECK-NEXT: [[TMP179:%.*]] = extractelement <16 x i32> [[TMP172]], i32 6 -; CHECK-NEXT: [[TMP180:%.*]] = insertelement <16 x i32> [[TMP178]], i32 [[TMP179]], i32 3 -; CHECK-NEXT: [[TMP181:%.*]] = extractelement <16 x i32> [[TMP172]], i32 5 -; CHECK-NEXT: [[TMP182:%.*]] = insertelement <16 x i32> [[TMP180]], i32 [[TMP181]], i32 4 -; CHECK-NEXT: [[TMP183:%.*]] = extractelement <16 x i32> [[TMP172]], i32 4 -; CHECK-NEXT: [[TMP184:%.*]] = insertelement <16 x i32> [[TMP182]], i32 [[TMP183]], i32 5 -; CHECK-NEXT: [[TMP185:%.*]] = extractelement <16 x i32> [[TMP172]], i32 3 -; CHECK-NEXT: [[TMP186:%.*]] = insertelement <16 x i32> [[TMP184]], i32 [[TMP185]], i32 6 -; CHECK-NEXT: [[TMP187:%.*]] = extractelement <16 x i32> [[TMP172]], i32 8 -; CHECK-NEXT: [[TMP188:%.*]] = insertelement <16 x i32> [[TMP186]], i32 [[TMP187]], i32 7 -; CHECK-NEXT: [[TMP189:%.*]] = extractelement <16 x i32> [[TMP172]], i32 7 -; CHECK-NEXT: [[TMP190:%.*]] = insertelement <16 x i32> [[TMP188]], i32 [[TMP189]], i32 8 -; CHECK-NEXT: [[TMP191:%.*]] = extractelement <16 x i32> [[TMP172]], i32 2 -; CHECK-NEXT: [[TMP192:%.*]] = insertelement <16 x i32> [[TMP190]], i32 [[TMP191]], i32 9 -; CHECK-NEXT: [[TMP193:%.*]] = extractelement <16 x i32> [[TMP172]], i32 11 -; CHECK-NEXT: [[TMP194:%.*]] = insertelement <16 x i32> [[TMP192]], i32 [[TMP193]], i32 10 -; CHECK-NEXT: [[TMP195:%.*]] = extractelement <16 x i32> [[TMP172]], i32 10 -; CHECK-NEXT: [[TMP196:%.*]] = insertelement <16 x i32> [[TMP194]], i32 [[TMP195]], i32 11 -; CHECK-NEXT: [[TMP197:%.*]] = extractelement <16 x i32> [[TMP172]], i32 1 -; CHECK-NEXT: [[TMP198:%.*]] = insertelement <16 x i32> [[TMP196]], i32 [[TMP197]], i32 12 -; CHECK-NEXT: [[TMP199:%.*]] = extractelement <16 x i32> [[TMP172]], i32 14 -; CHECK-NEXT: [[TMP200:%.*]] = insertelement <16 x i32> [[TMP198]], i32 [[TMP199]], i32 13 -; CHECK-NEXT: [[TMP201:%.*]] = extractelement <16 x i32> [[TMP172]], i32 13 -; CHECK-NEXT: [[TMP202:%.*]] = insertelement <16 x i32> [[TMP200]], i32 [[TMP201]], i32 14 -; CHECK-NEXT: [[TMP203:%.*]] = extractelement <16 x i32> [[TMP172]], i32 0 -; CHECK-NEXT: [[TMP204:%.*]] = insertelement <16 x i32> [[TMP202]], i32 [[TMP203]], i32 15 -; CHECK-NEXT: [[TMP205:%.*]] = add nsw <16 x i32> [[TMP172]], [[TMP204]] -; CHECK-NEXT: [[TMP206:%.*]] = sub nsw <16 x i32> [[TMP172]], [[TMP204]] -; CHECK-NEXT: [[TMP207:%.*]] = shufflevector <16 x i32> [[TMP205]], <16 x i32> [[TMP206]], <16 x i32> -; CHECK-NEXT: [[TMP208:%.*]] = lshr <16 x i32> [[TMP207]], -; CHECK-NEXT: [[TMP209:%.*]] = and <16 x i32> [[TMP208]], -; CHECK-NEXT: [[TMP210:%.*]] = mul nuw <16 x i32> [[TMP209]], -; CHECK-NEXT: [[TMP211:%.*]] = add <16 x i32> [[TMP210]], [[TMP207]] -; CHECK-NEXT: [[TMP212:%.*]] = xor <16 x i32> [[TMP211]], [[TMP210]] -; CHECK-NEXT: [[TMP213:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP212]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP213]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP213]], 16 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = sub nsw <16 x i32> [[TMP67]], [[REORDER]] +; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP67]], [[REORDER]] +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> +; CHECK-NEXT: [[REORDER1:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP70]], [[REORDER1]] +; CHECK-NEXT: [[TMP72:%.*]] = add nsw <16 x i32> [[TMP70]], [[REORDER1]] +; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> [[TMP72]], <16 x i32> +; CHECK-NEXT: [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> [[TMP72]], <16 x i32> +; CHECK-NEXT: [[TMP74:%.*]] = sub nsw <16 x i32> [[TMP73]], [[REORDER2]] +; CHECK-NEXT: [[TMP75:%.*]] = add nsw <16 x i32> [[TMP73]], [[REORDER2]] +; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> +; CHECK-NEXT: [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = add nsw <16 x i32> [[TMP76]], [[REORDER3]] +; CHECK-NEXT: [[TMP78:%.*]] = sub nsw <16 x i32> [[TMP76]], [[REORDER3]] +; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <16 x i32> [[TMP77]], <16 x i32> [[TMP78]], <16 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = lshr <16 x i32> [[TMP79]], +; CHECK-NEXT: [[TMP81:%.*]] = and <16 x i32> [[TMP80]], +; CHECK-NEXT: [[TMP82:%.*]] = mul nuw <16 x i32> [[TMP81]], +; CHECK-NEXT: [[TMP83:%.*]] = add <16 x i32> [[TMP82]], [[TMP79]] +; CHECK-NEXT: [[TMP84:%.*]] = xor <16 x i32> [[TMP83]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP84]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP85]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP85]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -139,11 +139,11 @@ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -139,11 +139,11 @@ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -82,23 +82,19 @@ ; CHECK-NEXT: ret float [[ADD]] ; ; THRESH1-LABEL: @f_used_twice_in_tree( -; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH1-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH1-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH1-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH1-NEXT: [[REORDER:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH1-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[REORDER]], [[X]] +; THRESH1-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; THRESH1-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[TMP3]] ; THRESH1-NEXT: ret float [[ADD]] ; ; THRESH2-LABEL: @f_used_twice_in_tree( -; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH2-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH2-NEXT: [[REORDER:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH2-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[REORDER]], [[X]] +; THRESH2-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; THRESH2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[TMP3]] ; THRESH2-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -58,18 +58,11 @@ ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[REORDER]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 Index: llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -722,15 +722,12 @@ ; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]] -; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]] -; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 -; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 -; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP11]], [[TMP12]] +; SSE-NEXT: [[REORDER:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[REORDER]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP7]], [[TMP8]] ; SSE-NEXT: ret double [[ADD3]] ; ; AVX-LABEL: @splat_loads( @@ -791,17 +788,14 @@ ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = fsub <2 x double> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP12]], i32 1 -; CHECK-NEXT: [[RES:%.*]] = fadd double [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[REORDER]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[REORDER1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP6]], [[REORDER1]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]] ; CHECK-NEXT: ret double [[RES]] ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/phi.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -244,27 +244,20 @@ ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP9]] = fmul <4 x float> [[TMP8]], +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1]] = fmul <4 x float> [[REORDER]], ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP9]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP13]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP5]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll +++ llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll @@ -12,22 +12,15 @@ ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]] ; CHECK-NEXT: [[TMP7:%.*]] = shl nsw <4 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <4 x i32> [[TMP8]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 16 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <4 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 16 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds i8, i8* undef, i64 4 Index: llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll +++ llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll @@ -7,21 +7,18 @@ ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[ISEC:%.*]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX10]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: br i1 false, label [[BLOCK1:%.*]], label [[BLOCK3:%.*]] ; CHECK: block1: ; CHECK-NEXT: br i1 false, label [[BLOCK2:%.*]], label [[BLOCK3]] ; CHECK: block2: ; CHECK-NEXT: br label [[BLOCK3]] ; CHECK: block3: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP7]] -; CHECK-NEXT: ret i32 [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[REORDER]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %arrayidx10 = getelementptr inbounds i32, i32* %isec, i32 0