Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -156,6 +156,117 @@ return true; } +/// Checks if the vector of instructions can be represented as a shuffle, like: +/// %x0 = extractelement <4 x i8> %x, i32 0 +/// %x3 = extractelement <4 x i8> %x, i32 3 +/// %y1 = extractelement <4 x i8> %y, i32 1 +/// %y2 = extractelement <4 x i8> %y, i32 2 +/// %x0x0 = mul i8 %x0, %x0 +/// %x3x3 = mul i8 %x3, %x3 +/// %y1y1 = mul i8 %y1, %y1 +/// %y2y2 = mul i8 %y2, %y2 +/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 +/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 +/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 +/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 +/// ret <4 x i8> %ins4 +/// can be transformed into: +/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> +/// %2 = mul <4 x i8> %1, %1 +/// ret <4 x i8> %2 +/// We convert this initially to something like: +/// %x0 = extractelement <4 x i8> %x, i32 0 +/// %x3 = extractelement <4 x i8> %x, i32 3 +/// %y1 = extractelement <4 x i8> %y, i32 1 +/// %y2 = extractelement <4 x i8> %y, i32 2 +/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0 +/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 +/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 +/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 +/// %5 = mul <4 x i8> %4, %4 +/// %6 = extractelement <4 x i8> %5, i32 0 +/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0 +/// %7 = extractelement <4 x i8> %5, i32 1 +/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 +/// %8 = extractelement <4 x i8> %5, i32 2 +/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 +/// %9 = extractelement <4 x i8> %5, i32 3 +/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 +/// ret <4 x i8> %ins4 +/// InstCombiner transforms this into a shuffle and vector mul +static Optional +isShuffle(ArrayRef VL) { + auto *EI0 = cast(VL[0]); + unsigned Size = EI0->getVectorOperandType()->getVectorNumElements(); + Value *Vec1 = nullptr; + Value *Vec2 = nullptr; + enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute}; + ShuffleMode CommonShuffleMode = Unknown; + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + auto *EI = cast(VL[I]); + auto *Vec = EI->getVectorOperand(); + // All vector operands must have the same number of vector elements. + if (Vec->getType()->getVectorNumElements() != Size) + return None; + auto *Idx = dyn_cast(EI->getIndexOperand()); + if (!Idx) + return None; + // Undefined behaviour if Idx is negative or >= Size. + if (Idx->getValue().isNegative()) + continue; + unsigned IntIdx = Idx->getValue().getZExtValue(); + if (IntIdx >= Size) + continue; + // We can extractelement from undef vector. + if (isa(Vec)) + continue; + // For correct shuffling we have to have at most 2 different vector operands + // in all extractelement instructions. + if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2) + return None; + if (CommonShuffleMode == Permute) + continue; + // If the extract index is not the same as the operation number, it is a + // permutation. + if (IntIdx != I) { + CommonShuffleMode = Permute; + continue; + } + // Check the shuffle mode for the current operation. + if (!Vec1) + Vec1 = Vec; + else if (!Vec2) + Vec2 = Vec; + // Example: shufflevector A, B, <0,5,2,7> + // I is odd and IntIdx for A == I - FirstAlternate shuffle. + // I is even and IntIdx for B == I - FirstAlternate shuffle. + // Example: shufflevector A, B, <4,1,6,3> + // I is even and IntIdx for A == I - SecondAlternate shuffle. + // I is odd and IntIdx for B == I - SecondAlternate shuffle. + ShuffleMode CurrentShuffleMode = ((Vec == Vec1) != static_cast(I & 1)) + ? FirstAlternate + : SecondAlternate; + // Common mode is not set or the same as the shuffle mode of the current + // operation - alternate. + if (CommonShuffleMode == Unknown) + CommonShuffleMode = CurrentShuffleMode; + // Common shuffle mode is not the same as the shuffle mode of the current + // operation - permutation. + if (CommonShuffleMode != CurrentShuffleMode) + CommonShuffleMode = Permute; + } + // If we're not crossing lanes in different vectors, consider it as blending. + if ((CommonShuffleMode == FirstAlternate || + CommonShuffleMode == SecondAlternate) && + Vec2) + return TargetTransformInfo::SK_Alternate; + // If Vec2 was never used, we have a permutation of a single vector, otherwise + // we have permutation of 2 vectors. + return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc + : TargetTransformInfo::SK_PermuteSingleSrc; +} + ///\returns Opcode that can be clubbed with \p Op to create an alternate /// sequence which can later be merged as a ShuffleVector instruction. static unsigned getAltOpcode(unsigned Op) { @@ -1734,6 +1845,26 @@ if (isSplat(VL)) { return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } + if (getSameOpcode(VL) == Instruction::ExtractElement) { + Optional ShuffleKind = isShuffle(VL); + if (ShuffleKind.hasValue()) { + int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + for (auto *V : VL) { + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + if (areAllUsersVectorized(cast(V)) && + !ScalarToTreeEntry.count(V)) { + auto *IO = cast( + cast(V)->getIndexOperand()); + Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + IO->getZExtValue()); + } + } + return Cost; + } + } return getGatherCost(E->Scalars); } unsigned Opcode = getSameOpcode(VL); Index: test/Transforms/SLPVectorizer/X86/blending-shuffle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -3,13 +3,9 @@ define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @g( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] -; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[X0X0]], i32 0 -; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1 -; CHECK-NEXT: ret <2 x i8> [[INS2]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: ret <2 x i8> [[TMP2]] ; %x0 = extractelement <2 x i8> %x, i32 0 %y1 = extractelement <2 x i8> %y, i32 1 @@ -22,19 +18,9 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @h( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] -; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] -; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0 -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1 -; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2 -; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3 -; CHECK-NEXT: ret <4 x i8> [[INS4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: ret <4 x i8> [[TMP2]] ; %x0 = extractelement <4 x i8> %x, i32 0 %x3 = extractelement <4 x i8> %x, i32 3 @@ -53,16 +39,9 @@ define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @h_undef( -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] -; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1 -; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2 -; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3 -; CHECK-NEXT: ret <4 x i8> [[INS4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: ret <4 x i8> [[TMP2]] ; %x0 = extractelement <4 x i8> undef, i32 0 %x3 = extractelement <4 x i8> %x, i32 3 @@ -81,17 +60,13 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @i( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] -; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]] -; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i8> [[TMP2]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i8> [[BIN_RDX]], <4 x i8> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i8> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[BIN_RDX2]], i32 0 ; CHECK-NEXT: ret i8 [[TMP3]] ; %x0 = extractelement <4 x i8> %x, i32 0 @@ -110,18 +85,15 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: @j( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] -; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]] -; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; %x0 = extractelement <4 x i8> %x, i32 0 %x3 = extractelement <4 x i8> %x, i32 3 @@ -139,18 +111,15 @@ define i8 @k(<4 x i8> %x) { ; CHECK-LABEL: @k( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] -; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] -; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; %x0 = extractelement <4 x i8> %x, i32 0 %x3 = extractelement <4 x i8> %x, i32 3