diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -104,6 +104,7 @@
   bool scalarizeLoadExtract(Instruction &I);
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldSelectShuffle(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
     Old.replaceAllUsesWith(&New);
@@ -1225,6 +1226,219 @@
   return false;
 }
 
+/// This method looks for groups of shuffles acting on binops, of the form:
+///  %x = shuffle ...
+///  %y = shuffle ...
+///  %a = binop %x, %y
+///  %b = binop %x, %y
+///  shuffle %a, %b, selectmask
+/// We may, especially if the shuffle is wider than legal, be able to convert
+/// the shuffle to a form where only parts of a and b need to be computed. On
+/// architectures with no obvious "select" shuffle, this can reduce the total
+/// number of operations if the target reports them as cheaper.
+bool VectorCombine::foldSelectShuffle(Instruction &I) {
+  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+  auto *VT = dyn_cast<FixedVectorType>(I.getType());
+  if (!SVI || !VT)
+    return false;
+  auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
+  if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
+      VT != Op0->getType())
+    return false;
+  auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
+  auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
+  auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
+  auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
+  auto checkSVNonOpUses = [&](Instruction *I) {
+    if (!I || I->getOperand(0)->getType() != VT)
+      return true;
+    return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; });
+  };
+  if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
+      checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
+    return false;
+
+  // Collect all the uses that are shuffles that we can transform together. We
+  // may not have a single shuffle, but a group that can all be transformed
+  // together profitably.
+  SmallVector<ShuffleVectorInst *> Shuffles;
+  auto collectShuffles = [&](Instruction *I) {
+    for (auto *U : I->users()) {
+      auto *SV = dyn_cast<ShuffleVectorInst>(U);
+      if (!SV || SV->getType() != VT)
+        return false;
+      if (find(Shuffles, SV) == Shuffles.end())
+        Shuffles.push_back(SV);
+    }
+    return true;
+  };
+  if (!collectShuffles(Op0) || !collectShuffles(Op1))
+    return false;
+
+  // For each of the output shuffles, we try to sort all the first vector
+  // elements to the beginning, followed by the second vector elements at the
+  // end. If the binops are legalized to smaller vectors, this may reduce the
+  // total number of binops. We compute the ReconstructMask mask needed to
+  // convert back to the original lane order.
+  SmallVector<int> V1, V2;
+  SmallVector<SmallVector<int>> ReconstructMasks;
+  int MaxV1Elt = 0, MaxV2Elt = 0;
+  unsigned NumElts = VT->getNumElements();
+  for (ShuffleVectorInst *SVN : Shuffles) {
+    SmallVector<int> Mask;
+    SVN->getShuffleMask(Mask);
+
+    // Check the operands are the same as the original, or reversed (in which
+    // case we need to commute the mask).
+    Value *SVOp0 = SVN->getOperand(0);
+    Value *SVOp1 = SVN->getOperand(1);
+    if (SVOp0 == Op1 && SVOp1 == Op0) {
+      std::swap(SVOp0, SVOp1);
+      ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+    }
+    if (SVOp0 != Op0 || SVOp1 != Op1)
+      return false;
+
+    // Calculate the reconstruction mask for this shuffle, as the mask needed
+    // to take the packed values from Op0/Op1 and reconstruct them into the
+    // original lane order.
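+    // For example, with NumElts == 4, a shuffle mask of <1, 5, 0, 6> packs to
+    // V1 = {1, 0} and V2 = {1, 2} and produces the reconstruction mask
+    // <0, 4, 1, 5>, where mask values below NumElts select lanes from the
+    // new Op0 and the rest select lanes from the new Op1.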
+    SmallVector<int> ReconstructMask;
+    for (unsigned I = 0; I < Mask.size(); I++) {
+      if (Mask[I] < 0) {
+        ReconstructMask.push_back(-1);
+      } else if (Mask[I] < static_cast<int>(NumElts)) {
+        MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
+        auto It = find(V1, Mask[I]);
+        if (It != V1.end())
+          ReconstructMask.push_back(It - V1.begin());
+        else {
+          ReconstructMask.push_back(V1.size());
+          V1.push_back(Mask[I]);
+        }
+      } else {
+        MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
+        auto It = find(V2, Mask[I] - NumElts);
+        if (It != V2.end())
+          ReconstructMask.push_back(NumElts + It - V2.begin());
+        else {
+          ReconstructMask.push_back(NumElts + V2.size());
+          V2.push_back(Mask[I] - NumElts);
+        }
+      }
+    }
+
+    ReconstructMasks.push_back(ReconstructMask);
+  }
+
+  // If the maximum elements used from V1 and V2 are not larger than the new
+  // vectors, the vectors are already packed and performing the optimization
+  // again will likely not help any further. This also prevents us from getting
+  // stuck in a cycle in case the costs do not also rule it out.
+  if (V1.empty() || V2.empty() ||
+      (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
+       MaxV2Elt == static_cast<int>(V2.size()) - 1))
+    return false;
+
+  // Calculate the masks needed for the new input shuffles, which get padded
+  // with undef.
+  SmallVector<int> V1A, V1B, V2A, V2B;
+  for (unsigned I = 0; I < V1.size(); I++) {
+    V1A.push_back(SVI0A->getMaskValue(V1[I]));
+    V1B.push_back(SVI0B->getMaskValue(V1[I]));
+  }
+  for (unsigned I = 0; I < V2.size(); I++) {
+    V2A.push_back(SVI1A->getMaskValue(V2[I]));
+    V2B.push_back(SVI1B->getMaskValue(V2[I]));
+  }
+  while (V1A.size() < NumElts) {
+    V1A.push_back(UndefMaskElem);
+    V1B.push_back(UndefMaskElem);
+  }
+  while (V2A.size() < NumElts) {
+    V2A.push_back(UndefMaskElem);
+    V2B.push_back(UndefMaskElem);
+  }
+
+  auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) {
+    return C +
+           TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask());
+  };
+  auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+    return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
+  };
+
+  // Get the costs of the shuffles + binops before and after with the new
+  // shuffle masks.
+  InstructionCost CostBefore =
+      TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
+      TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
+  CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
+                                InstructionCost(0), AddShuffleCost);
+  // This set helps us only cost each unique shuffle once.
+  SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles(
+      {SVI0A, SVI0B, SVI1A, SVI1B});
+  CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+                                InstructionCost(0), AddShuffleCost);
+
+  // The new binops will be unused for lanes past the used shuffle lengths.
+  // These types attempt to get the correct cost for that from the target.
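+  // For example, if V1 only uses 6 lanes of a <16 x i32> binop, the cost of
+  // the new binop is queried as a <6 x i32> operation.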
+  FixedVectorType *Op0SmallVT =
+      FixedVectorType::get(VT->getScalarType(), V1.size());
+  FixedVectorType *Op1SmallVT =
+      FixedVectorType::get(VT->getScalarType(), V2.size());
+  InstructionCost CostAfter =
+      TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
+      TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
+  CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
+                               InstructionCost(0), AddShuffleMaskCost);
+  std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
+  CostAfter +=
+      std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
+                      InstructionCost(0), AddShuffleMaskCost);
+
+  if (CostBefore <= CostAfter)
+    return false;
+
+  // The cost model has passed, create the new instructions.
+  Builder.SetInsertPoint(SVI0A);
+  Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
+                                             SVI0A->getOperand(1), V1A);
+  Builder.SetInsertPoint(SVI0B);
+  Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
+                                             SVI0B->getOperand(1), V1B);
+  Builder.SetInsertPoint(SVI1A);
+  Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
+                                             SVI1A->getOperand(1), V2A);
+  Builder.SetInsertPoint(SVI1B);
+  Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
+                                             SVI1B->getOperand(1), V2B);
+  Builder.SetInsertPoint(Op0);
+  Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
+                                    NSV0A, NSV0B);
+  if (auto *I = dyn_cast<Instruction>(NOp0))
+    I->copyIRFlags(Op0, true);
+  Builder.SetInsertPoint(Op1);
+  Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
+                                    NSV1A, NSV1B);
+  if (auto *I = dyn_cast<Instruction>(NOp1))
+    I->copyIRFlags(Op1, true);
+
+  for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
+    Builder.SetInsertPoint(Shuffles[S]);
+    Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
+    replaceValue(*Shuffles[S], *NSV);
+  }
+
+  Worklist.pushValue(NSV0A);
+  Worklist.pushValue(NSV0B);
+  Worklist.pushValue(NSV1A);
+  Worklist.pushValue(NSV1B);
+  for (auto *S : Shuffles)
+    Worklist.add(S);
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -1245,6 +1459,7 @@
       MadeChange |= foldExtractedCmps(I);
       MadeChange |= foldShuffleOfBinops(I);
       MadeChange |= foldShuffleFromReductions(I);
+      MadeChange |= foldSelectShuffle(I);
     }
     MadeChange |= scalarizeBinopOrCmp(I);
     MadeChange |= scalarizeLoadExtract(I);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
--- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
@@ -128,11 +128,13 @@
 
 define <16 x i32> @test2_2(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test2_2(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
-; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
   %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32>
@@ -166,11 +168,13 @@
 
 define <16 x i32> @test3_1(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test3_1(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
-; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
   %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32>
@@ -183,11 +187,13 @@
 
 define <16 x i32> @test3_2(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test3_2(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
-; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
   %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32>
@@ -222,12 +228,14 @@
 
 define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test23(
-; CHECK-NEXT:    [[S10:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
-; CHECK-NEXT:    [[S20:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
-; CHECK-NEXT:    [[A0:%.*]] = add nsw <16 x i32> [[S10]], [[S20]]
-; CHECK-NEXT:    [[B0:%.*]] = sub nsw <16 x i32> [[S10]], [[S20]]
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[A0]], <16 x i32> [[B0]], <16 x i32>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[B0]], <16 x i32> [[A0]], <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
 ; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32>
@@ -251,11 +259,13 @@
 
 define <16 x i32> @testgood(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @testgood(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
-; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
   %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32>
@@ -361,14 +371,15 @@
 
 define <16 x i32> @test_1651256324(<16 x i32> %l0, <16 x i32> %l1, <16 x i32> %l6, <16 x i32> %l7) {
 ; CHECK-LABEL: @test_1651256324(
-; CHECK-NEXT:    [[S0:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32>
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[L1]], <16 x i32> [[L1]], <16 x i32>
 ; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[L7:%.*]], <16 x i32> [[L7]], <16 x i32>
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32>
-; CHECK-NEXT:    [[ADD:%.*]] = add <16 x i32> [[S1]], [[S1]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub <16 x i32> [[S0]], [[S3]]
-; CHECK-NEXT:    [[T0:%.*]] = shufflevector <16 x i32> [[ADD]], <16 x i32> [[SUB]], <16 x i32>
-; CHECK-NEXT:    [[T1:%.*]] = shufflevector <16 x i32> [[SUB]], <16 x i32> [[ADD]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <16 x i32> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[T0:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    [[R:%.*]] = xor <16 x i32> [[T0]], [[T1]]
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -495,27 +506,29 @@
 ; CHECK-NEXT:    [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32>
 ; CHECK-NEXT:    [[TMP60:%.*]] = add nsw <16 x i32> [[TMP59]], [[REORDER]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[REORDER]]
-; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
-; CHECK-NEXT:    [[REORDER1:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = add nsw <16 x i32> [[TMP62]], [[REORDER1]]
-; CHECK-NEXT:    [[TMP64:%.*]] = sub nsw <16 x i32> [[TMP62]], [[REORDER1]]
-; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> [[TMP64]], <16 x i32>
-; CHECK-NEXT:    [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> [[TMP64]], <16 x i32>
-; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP65]], [[REORDER2]]
-; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP65]], [[REORDER2]]
-; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
-; CHECK-NEXT:    [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = add nsw <16 x i32> [[TMP68]], [[REORDER3]]
-; CHECK-NEXT:    [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP68]], [[REORDER3]]
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32>
-; CHECK-NEXT:    [[TMP72:%.*]] = lshr <16 x i32> [[TMP71]],
-; CHECK-NEXT:    [[TMP73:%.*]] = and <16 x i32> [[TMP72]],
-; CHECK-NEXT:    [[TMP74:%.*]] = mul nuw <16 x i32> [[TMP73]],
-; CHECK-NEXT:    [[TMP75:%.*]] = add <16 x i32> [[TMP74]], [[TMP71]]
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <16 x i32> [[TMP75]], [[TMP74]]
-; CHECK-NEXT:    [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP76]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP77]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP77]], 16
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT:    [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = add nsw <16 x i32> [[TMP68]], [[REORDER2]]
+; CHECK-NEXT:    [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP68]], [[REORDER2]]
+; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32>
+; CHECK-NEXT:    [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32>
+; CHECK-NEXT:    [[TMP72:%.*]] = add nsw <16 x i32> [[TMP71]], [[REORDER3]]
+; CHECK-NEXT:    [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP71]], [[REORDER3]]
+; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32>
+; CHECK-NEXT:    [[TMP75:%.*]] = lshr <16 x i32> [[TMP74]],
+; CHECK-NEXT:    [[TMP76:%.*]] = and <16 x i32> [[TMP75]],
+; CHECK-NEXT:    [[TMP77:%.*]] = mul nuw <16 x i32> [[TMP76]],
+; CHECK-NEXT:    [[TMP78:%.*]] = add <16 x i32> [[TMP77]], [[TMP74]]
+; CHECK-NEXT:    [[TMP79:%.*]] = xor <16 x i32> [[TMP78]], [[TMP77]]
+; CHECK-NEXT:    [[TMP80:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP79]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP80]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP80]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
@@ -700,27 +713,29 @@
 ; CHECK-NEXT:    [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32>
 ; CHECK-NEXT:    [[TMP52:%.*]] = add nsw <16 x i32> [[TMP51]], [[REORDER]]
 ; CHECK-NEXT:    [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[REORDER]]
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32>
-; CHECK-NEXT:    [[REORDER191:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = add nsw <16 x i32> [[TMP54]], [[REORDER191]]
-; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP54]], [[REORDER191]]
-; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> [[TMP56]], <16 x i32>
-; CHECK-NEXT:    [[REORDER192:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> [[TMP56]], <16 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = add nsw <16 x i32> [[TMP57]], [[REORDER192]]
-; CHECK-NEXT:    [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP57]], [[REORDER192]]
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
-; CHECK-NEXT:    [[REORDER193:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP60]], [[REORDER193]]
-; CHECK-NEXT:    [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP60]], [[REORDER193]]
-; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32>
-; CHECK-NEXT:    [[TMP64:%.*]] = lshr <16 x i32> [[TMP63]],
-; CHECK-NEXT:    [[TMP65:%.*]] = and <16 x i32> [[TMP64]],
-; CHECK-NEXT:    [[TMP66:%.*]] = mul nuw <16 x i32> [[TMP65]],
-; CHECK-NEXT:    [[TMP67:%.*]] = add <16 x i32> [[TMP66]], [[TMP63]]
-; CHECK-NEXT:    [[TMP68:%.*]] = xor <16 x i32> [[TMP67]], [[TMP66]]
-; CHECK-NEXT:    [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP68]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP69]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP69]], 16
+; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32>
+; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32>
+; CHECK-NEXT:    [[TMP58:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP57]]
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
+; CHECK-NEXT:    [[REORDER192:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP60]], [[REORDER192]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP60]], [[REORDER192]]
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32>
+; CHECK-NEXT:    [[REORDER193:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32>
+; CHECK-NEXT:    [[TMP64:%.*]] = add nsw <16 x i32> [[TMP63]], [[REORDER193]]
+; CHECK-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP63]], [[REORDER193]]
+; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32>
+; CHECK-NEXT:    [[TMP67:%.*]] = lshr <16 x i32> [[TMP66]],
+; CHECK-NEXT:    [[TMP68:%.*]] = and <16 x i32> [[TMP67]],
+; CHECK-NEXT:    [[TMP69:%.*]] = mul nuw <16 x i32> [[TMP68]],
+; CHECK-NEXT:    [[TMP70:%.*]] = add <16 x i32> [[TMP69]], [[TMP66]]
+; CHECK-NEXT:    [[TMP71:%.*]] = xor <16 x i32> [[TMP70]], [[TMP69]]
+; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP71]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP72]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP72]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]