Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -167,6 +167,24 @@
 Constant *createSequentialMask(IRBuilder<> &Builder, unsigned Start,
                                unsigned NumInts, unsigned NumUndefs);
 
+/// Create a transpose shuffle mask.
+///
+/// This function creates a shuffle mask useful for transposing a 2xn matrix.
+/// The mask reads corresponding even- or odd-numbered vector elements from two
+/// n-dimensional source vectors and writes each result into consecutive
+/// elements of an n-dimensional destination vector. The elements of the mask
+/// begin with either zero or one depending on the value of \p IsOdd, and are
+/// of the form:
+///
+///   <0, VF + 0, 2, VF + 2, ..., VF - 2, 2 * VF - 2>, for IsOdd = false, and
+///   <1, VF + 1, 3, VF + 3, ..., VF - 1, 2 * VF - 1>, for IsOdd = true.
+///
+/// For example, the mask for VF = 4 is:
+///
+///   <0, 4, 2, 6>, for IsOdd = false, and
+///   <1, 5, 3, 7>, for IsOdd = true.
+Constant *createTransposeMask(IRBuilder<> &Builder, unsigned VF, bool IsOdd);
+
 /// Concatenate a list of vectors.
 ///
 /// This function generates code that concatenate the vectors in \p Vecs into a
Index: lib/Analysis/VectorUtils.cpp
===================================================================
--- lib/Analysis/VectorUtils.cpp
+++ lib/Analysis/VectorUtils.cpp
@@ -523,6 +523,17 @@
   return ConstantVector::get(Mask);
 }
 
+Constant *llvm::createTransposeMask(IRBuilder<> &Builder, unsigned VF,
+                                    bool IsOdd) {
+  SmallVector<Constant *, 16> Mask;
+  unsigned StartVal = IsOdd ? 1 : 0;
+  for (unsigned I = 0; I < VF; I += 2)
+    for (unsigned J = 0; J < 2; ++J)
+      Mask.push_back(Builder.getInt32(StartVal + I + VF * J));
+
+  return ConstantVector::get(Mask);
+}
+
 /// A helper function for concatenating vectors. This function concatenates two
 /// vectors having the same element type. If the second vector has fewer
 /// elements than the first, it is padded with undefs.
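To make the mask layout concrete, here is a small standalone sketch (not part of the patch) that computes the same index pattern createTransposeMask builds, using plain unsigned values in place of ConstantInt; the helper name transposeMaskIndices is illustrative only. For VF = 4 it prints 0 4 2 6 for the even mask and 1 5 3 7 for the odd mask, matching the example in the header comment.

    #include <cstdio>
    #include <vector>

    // Mirrors the index pattern of createTransposeMask (illustrative only).
    static std::vector<unsigned> transposeMaskIndices(unsigned VF, bool IsOdd) {
      std::vector<unsigned> Mask;
      unsigned StartVal = IsOdd ? 1 : 0;
      for (unsigned I = 0; I < VF; I += 2)   // walk the even (or odd) lanes
        for (unsigned J = 0; J < 2; ++J)     // pull the lane from each source vector
          Mask.push_back(StartVal + I + VF * J);
      return Mask;
    }

    int main() {
      for (bool IsOdd : {false, true}) {
        for (unsigned Idx : transposeMaskIndices(4, IsOdd))
          std::printf("%u ", Idx);           // 0 4 2 6  /  1 5 3 7
        std::printf("\n");
      }
      return 0;
    }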
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -747,12 +747,17 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
+
+    /// Have the operands of this tree entry been transposed to enable
+    /// vectorization?
+    SmallVector<ValueList, 4> TransposedOperands;
   };
 
   /// Create a new VectorizableTree entry.
   void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
                     ArrayRef<unsigned> ReuseShuffleIndices = None,
-                    ArrayRef<unsigned> ReorderIndices = None) {
+                    ArrayRef<unsigned> ReorderIndices = None,
+                    ArrayRef<ValueList> TransposedOperands = None) {
     VectorizableTree.emplace_back(VectorizableTree);
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
@@ -761,6 +766,8 @@
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices = ReorderIndices;
+    Last->TransposedOperands.append(TransposedOperands.begin(),
+                                    TransposedOperands.end());
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -1362,6 +1369,178 @@
 
 } // end namespace llvm
 
+/// Try to transpose a bundle of binary operations if doing so would
+/// enable vectorization.
+///
+/// Given a bundle of isomorphic binary operations, we normally try to
+/// vectorize their operands by bundling all operand-zero values together and
+/// bundling all operand-one values together. Thus, assuming all operands are
+/// instructions, each operand bundle must also be isomorphic. That is, all
+/// operand-zero values must have the same opcode and all operand-one values
+/// must have the same opcode. For example:
+///
+///   ; Original expression: Each operand bundle has a single opcode (the
+///   ; operand-zero bundle is all 'add' and the operand-one bundle is all
+///   ; 'sub'), so the operations are vectorizable.
+///   add (add s0, s1) (sub s2, s3)
+///   add (add s4, s5) (sub s6, s7)
+///
+///   ; Vectorized IR (simplified)
+///   %a = add <s0, s4>, <s1, s5>        ; = <add s0 s1, add s4 s5>
+///   %b = sub <s2, s6>, <s3, s7>        ; = <sub s2 s3, sub s6 s7>
+///   %c = add %a, %b
+///
+/// However, if an operand bundle is nonisomorphic (i.e., the instructions in
+/// the bundle have more than one opcode), it is not vectorizable. If this is
+/// the case, we can try to "transpose" the operands such that each resulting
+/// bundle will have a single opcode. For example:
+///
+///   ; Neither operand bundle is vectorizable because they each contain more
+///   ; than one opcode ('add' and 'sub'). However, we can transpose the
+///   ; bundles to enable vectorization.
+///   add (add s0, s1) (add s2, s3)
+///   add (sub s4, s5) (sub s6, s7)
+///
+///   ; Vectorized IR (simplified)
+///   %a = add <s0, s2>, <s1, s3>        ; = <add s0 s1, add s2 s3>
+///   %b = sub <s4, s6>, <s5, s7>        ; = <sub s4 s5, sub s6 s7>
+///   %c = shufflevector %a, %b, <0, 2>  ; = <add s0 s1, sub s4 s5>
+///   %d = shufflevector %a, %b, <1, 3>  ; = <add s2 s3, sub s6 s7>
+///   %e = add %c, %d
+///
+/// When we transpose the operands, we bundle together all operands of one or
+/// more instructions, rather than bundling together a single operand (e.g.,
+/// operand-zero) of several instructions. Conceptually, this is a row-wise
+/// bundle instead of a column-wise bundle, hence the name "transpose". If a
+/// transpose is performed while building a vectorizable tree, we must
+/// re-transpose the vectors at run-time using shufflevector instructions to
+/// restore the correct bundling.
+///
+/// This function determines if a transpose is needed for a given bundle, and
+/// if so, returns a list of isomorphic "operand" bundles.
+static Optional<SmallVector<ValueList, 4>>
+transposeBinOpBundle(ArrayRef<Value *> VL) {
+  assert(!VL.empty() && isa<BinaryOperator>(VL[0]) && "No binary operator");
+  assert(getSameOpcode(VL).Opcode && "No single opcode");
+  assert(!getSameOpcode(VL).IsAltShuffle && "Has alternate opcode");
+
+  if (VL.size() < 2)
+    return None;
+
+  // Check that for each instruction in the given bundle, its operands are also
+  // instructions having identical opcodes.
+  if (any_of(VL, [](Value *V) {
+        auto *VLI = cast<Instruction>(V);
+        auto *VLIOp0 = dyn_cast<Instruction>(VLI->getOperand(0));
+        auto *VLIOp1 = dyn_cast<Instruction>(VLI->getOperand(1));
+        return !VLIOp0 || !VLIOp1 ||
+               VLIOp0->getOpcode() != VLIOp1->getOpcode();
+      }))
+    return None;
+
+  auto *VL0 = cast<Instruction>(VL[0]);
+  auto *VL0Op0 = cast<Instruction>(VL0->getOperand(0));
+  auto *VL0Op1 = cast<Instruction>(VL0->getOperand(1));
+
+  // Check if the operand-zero bundle or the operand-one bundle is isomorphic.
+  // If this is the case, a transpose is unnecessary since at least one of the
+  // bundles is already vectorizable.
+  bool Op0IsIsomorphic = true;
+  bool Op1IsIsomorphic = true;
+  for (Value *V : VL.drop_front()) {
+    auto *VLIOp0 = cast<Instruction>(cast<Instruction>(V)->getOperand(0));
+    auto *VLIOp1 = cast<Instruction>(cast<Instruction>(V)->getOperand(1));
+    Op0IsIsomorphic &= VLIOp0->getOpcode() == VL0Op0->getOpcode();
+    Op1IsIsomorphic &= VLIOp1->getOpcode() == VL0Op1->getOpcode();
+    if (!Op0IsIsomorphic && !Op1IsIsomorphic)
+      break;
+  }
+  if (Op0IsIsomorphic || Op1IsIsomorphic)
+    return None;
+
+  // Group the operands according to their opcode. If the size of any group is
+  // not a power of two, we aren't going to transpose. Note that we've already
+  // shown that both operands of a given instruction have the same opcode.
+  MapVector<unsigned, ValueList> Opcode2Operands;
+  for (Value *V : VL) {
+    auto *VLIOp0 = cast<Instruction>(cast<Instruction>(V)->getOperand(0));
+    auto *VLIOp1 = cast<Instruction>(cast<Instruction>(V)->getOperand(1));
+    unsigned SameOpcode = VLIOp0->getOpcode();
+    assert(VLIOp1->getOpcode() == SameOpcode &&
+           "Operands should have the same opcode");
+    Opcode2Operands[SameOpcode].push_back(VLIOp0);
+    Opcode2Operands[SameOpcode].push_back(VLIOp1);
+  }
+  if (any_of(Opcode2Operands,
+             [](std::pair<unsigned, ValueList> &Entry) {
+               return !isPowerOf2_32(Entry.second.size());
+             }))
+    return None;
+
+  // Find the size of the smallest isomorphic bundle. We will partition larger
+  // bundles so that each bundle is the same size.
+  unsigned MinSize =
+      std::min_element(Opcode2Operands.begin(), Opcode2Operands.end(),
+                       [](std::pair<unsigned, ValueList> &Entry1,
+                          std::pair<unsigned, ValueList> &Entry2) {
+                         return Entry1.second.size() < Entry2.second.size();
+                       })
+          ->second.size();
+
+  // Collect the final bundles while ensuring that each bundle contains MinSize
+  // values.
+  SmallVector<ValueList, 4> Bundles;
+  for (const std::pair<unsigned, ValueList> &Entry : Opcode2Operands) {
+    ArrayRef<Value *> Bundle(Entry.second);
+    while (Bundle.size() >= MinSize) {
+      ArrayRef<Value *> Slice = Bundle.take_front(MinSize);
+      Bundles.emplace_back(Slice.begin(), Slice.end());
+      Bundle = Bundle.drop_front(MinSize);
+    }
+  }
+
+  return Bundles;
+}
+
+/// Compute the cost of the shufflevector instructions needed to re-transpose
+/// binary operator bundles at run-time.
+///
+/// Unlike a normal binary operator bundle having two input bundles (one for
+/// each operand), a transposed binary operator bundle can have more than two
+/// (i.e., \p NumBundles) input bundles. We generate code for the transpose by
+/// concatenating the transposed input bundles together, and then shuffling out
+/// vectors corresponding to the operand-zero and operand-one bundles of the
+/// binary operator. So we have two kinds of shuffles to account for. We use
+/// the SK_InsertSubvector ShuffleKind for concatenating vectors, and the
+/// SK_Transpose ShuffleKind for the transpose.
+static unsigned getTransposedBinOpShuffleCost(TargetTransformInfo *TTI,
+                                              unsigned BundleVF,
+                                              unsigned NumBundles,
+                                              VectorType *VecTy) {
+  assert(isPowerOf2_32(NumBundles) && "NumBundles is not a power of 2");
+
+  // Compute the cost of concatenating the NumBundles vectors together, where
+  // each vector initially contains BundleVF elements. The result of the
+  // concatenation is two vectors with type VecTy that will be used as inputs
+  // to the shufflevector instructions performing the actual transpose. We
+  // assume the concatenation will be performed in ~log2(NumBundles) steps.
+  unsigned Cost = 0;
+  for (unsigned NumDstVecs = NumBundles / 2, ElemsPerSrcVec = BundleVF;
+       NumDstVecs >= 2; NumDstVecs /= 2, ElemsPerSrcVec *= 2) {
+    Type *SrcTy = VectorType::get(VecTy->getElementType(), ElemsPerSrcVec);
+    Type *DstTy = VectorType::get(VecTy->getElementType(), 2 * ElemsPerSrcVec);
+    Cost += NumDstVecs *
+            TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector, DstTy,
+                                ElemsPerSrcVec, SrcTy);
+  }
+
+  // Compute the cost of the transpose. The transpose requires two
+  // shufflevector instructions: one to compute the operand-zero bundle and
+  // another to compute the operand-one bundle.
+  Cost += 2 * TTI->getShuffleCost(TargetTransformInfo::SK_Transpose, VecTy);
+
+  return Cost;
+}
+
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
   ExtraValueToDebugLocsMap ExternallyUsedValues;
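As a standalone illustration of the grouping step performed by transposeBinOpBundle (simplified to plain strings rather than llvm::Instruction, and not part of the patch), the sketch below partitions the operands of the second example in the comment above row-wise by opcode, yielding one 'add' bundle and one 'sub' bundle of two values each. In the real code, groups larger than MinSize would additionally be split into MinSize-sized bundles.

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // One scalar operand of a root binary operator, tagged with its opcode.
    struct Operand {
      std::string Opcode;
      std::string Name;
    };

    int main() {
      // Operands of the two roots: add (add s0, s1) (add s2, s3) and
      //                            add (sub s4, s5) (sub s6, s7).
      std::vector<Operand> Operands = {{"add", "add s0 s1"}, {"add", "add s2 s3"},
                                       {"sub", "sub s4 s5"}, {"sub", "sub s6 s7"}};

      // Group row-wise by opcode (the patch uses a MapVector keyed on the
      // integer opcode, which also preserves insertion order).
      std::map<std::string, std::vector<std::string>> Bundles;
      for (const Operand &Op : Operands)
        Bundles[Op.Opcode].push_back(Op.Name);

      // Prints: add: [add s0 s1] [add s2 s3]
      //         sub: [sub s4 s5] [sub s6 s7]
      for (const auto &Bundle : Bundles) {
        std::printf("%s:", Bundle.first.c_str());
        for (const std::string &Name : Bundle.second)
          std::printf(" [%s]", Name.c_str());
        std::printf("\n");
      }
      return 0;
    }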
@@ -1794,9 +1973,27 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
+      // If the operand bundles of this binary operator are not vectorizable,
+      // determine if we can transpose them, and if so, continue building the
+      // tree with the transposed bundles. That there is an instruction bundle
+      // for which we can compute a number of transposable operand bundles
+      // implies that the two operands of each instruction in the bundle have
+      // the same opcode. This is mutually exclusive with swapping commutative
+      // operands, which we attempt to do next.
+      if (isa<BinaryOperator>(VL0))
+        if (Optional<SmallVector<ValueList, 4>> TransposedBundles =
+                transposeBinOpBundle(VL)) {
+          newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies,
+                       /*ReorderIndices=*/None, *TransposedBundles);
+          for (const auto &Bundle : *TransposedBundles)
+            buildTree_rec(Bundle, Depth + 1, UserTreeIdx);
+          return;
+        }
+
+      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
@@ -2293,33 +2490,46 @@
       TargetTransformInfo::OperandValueProperties Op2VP =
           TargetTransformInfo::OP_None;
 
-      // If all operands are exactly the same ConstantInt then set the
-      // operand kind to OK_UniformConstantValue.
-      // If instead not all operands are constants, then set the operand kind
-      // to OK_AnyValue. If all operands are constants but not the same,
-      // then set the operand kind to OK_NonUniformConstantValue.
-      ConstantInt *CInt = nullptr;
-      for (unsigned i = 0; i < VL.size(); ++i) {
-        const Instruction *I = cast<Instruction>(VL[i]);
-        if (!isa<ConstantInt>(I->getOperand(1))) {
-          Op2VK = TargetTransformInfo::OK_AnyValue;
-          break;
-        }
-        if (i == 0) {
-          CInt = cast<ConstantInt>(I->getOperand(1));
-          continue;
-        }
-        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-            CInt != cast<ConstantInt>(I->getOperand(1)))
-          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-      }
-      // FIXME: Currently cost of model modification for division by power of
-      // 2 is handled for X86 and AArch64. Add support for other targets.
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-          CInt->getValue().isPowerOf2())
-        Op2VP = TargetTransformInfo::OP_PowerOf2;
+      unsigned TransposeShuffleCost = 0;
+      SmallVector<const Value *, 4> Operands;
+
+      if (!E->TransposedOperands.empty()) {
+        // If this binary operator's operand bundles can be transposed, we need
+        // to account for the shufflevector instructions that will perform the
+        // transpose operation.
+        TransposeShuffleCost =
+            getTransposedBinOpShuffleCost(TTI, E->TransposedOperands[0].size(),
+                                          E->TransposedOperands.size(), VecTy);
+      } else {
+        // If all operands are exactly the same ConstantInt then set the
+        // operand kind to OK_UniformConstantValue.
+        // If instead not all operands are constants, then set the operand kind
+        // to OK_AnyValue. If all operands are constants but not the same,
+        // then set the operand kind to OK_NonUniformConstantValue.
+        ConstantInt *CInt = nullptr;
+        for (unsigned i = 0; i < VL.size(); ++i) {
+          const Instruction *I = cast<Instruction>(VL[i]);
+          if (!isa<ConstantInt>(I->getOperand(1))) {
+            Op2VK = TargetTransformInfo::OK_AnyValue;
+            break;
+          }
+          if (i == 0) {
+            CInt = cast<ConstantInt>(I->getOperand(1));
+            continue;
+          }
+          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+              CInt != cast<ConstantInt>(I->getOperand(1)))
+            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+        }
+        // FIXME: Currently cost of model modification for division by power of
+        // 2 is handled for X86 and AArch64. Add support for other targets.
+        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+            CInt->getValue().isPowerOf2())
+          Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+        Operands.append(VL0->value_op_begin(), VL0->value_op_end());
+      }
 
-      SmallVector<const Value *, 4> Operands(VL0->operand_values());
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -=
             (ReuseShuffleNumbers - VL.size()) *
@@ -2332,7 +2542,7 @@
                                                Op2VP, Operands);
       int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
                                                 Op1VP, Op2VP, Operands);
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      return ReuseShuffleCost + TransposeShuffleCost + VecCost - ScalarCost;
     }
     case Instruction::GetElementPtr: {
       TargetTransformInfo::OperandValueKind Op1VK =
@@ -3334,21 +3544,43 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      ValueList LHSVL, RHSVL;
-      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
-                                       RHSVL);
-      else
-        for (Value *V : E->Scalars) {
-          auto *I = cast<Instruction>(V);
-          LHSVL.push_back(I->getOperand(0));
-          RHSVL.push_back(I->getOperand(1));
-        }
-
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *LHS = vectorizeTree(LHSVL);
-      Value *RHS = vectorizeTree(RHSVL);
+      Value *LHS = nullptr;
+      Value *RHS = nullptr;
+      if (!E->TransposedOperands.empty()) {
+        // If this binary operator's operand bundles can be transposed,
+        // generate the shufflevector instructions that will perform the
+        // transpose operation.
+        //
+        // First, vectorize the transposed bundles and concatenate them
+        // together to form a single vector.
+        ValueList VectorizedValues;
+        for (const ValueList &Bundle : E->TransposedOperands)
+          VectorizedValues.push_back(vectorizeTree(Bundle));
+        Value *Merged = concatenateVectors(Builder, VectorizedValues);
+
+        // Next, perform the actual transpose by shuffling out the operand-zero
+        // and operand-one vectors using even and odd transpose masks.
+        Value *Undef = UndefValue::get(Merged->getType());
+        unsigned VF =
+            cast<VectorType>(Merged->getType())->getNumElements() / 2;
+        Constant *LHSMask = createTransposeMask(Builder, VF, /*IsOdd=*/false);
+        Constant *RHSMask = createTransposeMask(Builder, VF, /*IsOdd=*/true);
+        LHS = Builder.CreateShuffleVector(Merged, Undef, LHSMask);
+        RHS = Builder.CreateShuffleVector(Merged, Undef, RHSMask);
+      } else {
+        ValueList LHSVL, RHSVL;
+        if (VL0->isCommutative()) {
+          reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, RHSVL);
+        } else {
+          for (Value *V : E->Scalars) {
+            auto *I = cast<Instruction>(V);
+            LHSVL.push_back(I->getOperand(0));
+            RHSVL.push_back(I->getOperand(1));
+          }
+        }
+        LHS = vectorizeTree(LHSVL);
+        RHS = vectorizeTree(RHSVL);
+      }
 
       if (E->VectorizedValue) {
         DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
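To see what the new TransposeShuffleCost term amounts to in a simple case, consider (hypothetically) a root bundle of four i32 scalars, i.e. VecTy = <4 x i32>, fed by four transposed bundles of two elements each (NumBundles = 4, BundleVF = 2). The concatenation loop runs once and queries SK_InsertSubvector twice for inserting a <2 x i32> subvector into a <4 x i32>, and the transpose adds two SK_Transpose shuffles of <4 x i32>. The standalone sketch below (not part of the patch; shuffleCost and transposedBinOpShuffleCost are illustrative names) mirrors that accounting with every shuffle costed at 1, so it prints 4.

    #include <cstdio>

    // Stand-in for TTI->getShuffleCost(); assume every shuffle costs 1.
    static unsigned shuffleCost() { return 1; }

    // Mirrors the accounting in getTransposedBinOpShuffleCost for NumBundles
    // input bundles (illustrative only; the real code also grows the source
    // and destination vector types as the concatenation proceeds).
    static unsigned transposedBinOpShuffleCost(unsigned NumBundles) {
      unsigned Cost = 0;
      // Concatenate pairs of vectors until two full-width vectors remain.
      for (unsigned NumDstVecs = NumBundles / 2; NumDstVecs >= 2; NumDstVecs /= 2)
        Cost += NumDstVecs * shuffleCost(); // SK_InsertSubvector concatenations
      // Two shuffles re-transpose the result into the operand-zero and
      // operand-one vectors.
      Cost += 2 * shuffleCost();            // SK_Transpose shuffles
      return Cost;
    }

    int main() {
      std::printf("%u\n", transposedBinOpShuffleCost(4)); // prints 4
      return 0;
    }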
Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -5,19 +5,12 @@
 
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i32 1
-; CHECK-NEXT:    ret <2 x i64> [[TMP3_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -36,21 +29,17 @@
 
 define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* %a, i64 1
-; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* %b, i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* %c, i64 1
-; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* %a, align 8
-; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
-; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* %b, align 8
-; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* %c, align 8
-; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* %a to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* %b to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* %c to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
@@ -76,31 +65,12 @@
 
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> %v0, i32 2
-; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> %v0, i32 3
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> %v1, i32 1
-; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> %v1, i32 2
-; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> %v1, i32 3
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = add i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = add i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]]
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_2]], i32 2
-; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -131,20 +101,12 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_0]], i32 2
-; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_1]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -166,26 +128,14 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> [[SHUFFLE1]], <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> [[SHUFFLE1]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -208,26 +158,16 @@
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> %v0, %v1
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> %v0, %v1
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP1_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -254,38 +194,18 @@
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> %v0, i32 2
-; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> %v0, i32 3
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> %v1, i32 1
-; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> %v1, i32 2
-; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> %v1, i32 3
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = add i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = add i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP0_0]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP0_2]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1_2]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP0_1]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_3]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1_3]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]],
-; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i32> [[TMP10]],
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]],
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP14]])
-; CHECK-NEXT:    ret i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1