Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -278,36 +278,17 @@
   return Ty;
 }
 
-/// \returns True if the ExtractElement instructions in VL can be vectorized
-/// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
-  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
-  // Check if all of the extracts come from the same vector and from the
-  // correct offset.
-  Value *VL0 = VL[0];
-  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
-  Value *Vec = E0->getOperand(0);
-
-  // We have to extract from the same vector type.
-  unsigned NElts = Vec->getType()->getVectorNumElements();
-
-  if (NElts != VL.size())
-    return false;
-
-  // Check that all of the indices extract from the correct offset.
-  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
-  if (!CI || CI->getZExtValue())
-    return false;
-
-  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+  assert(Opcode == Instruction::ExtractElement ||
+         Opcode == Instruction::ExtractValue);
+  if (Opcode == Instruction::ExtractElement) {
     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
-    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
-      return false;
+    return CI && CI->getZExtValue() == Idx;
+  } else {
+    ExtractValueInst *EI = cast<ExtractValueInst>(E);
+    return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
   }
-
-  return true;
 }
 
 /// \returns True if in-tree use also needs extract. This refers to
@@ -373,6 +354,18 @@
       SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
       DL(DL), Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+    // Use the vector register size specified by the target unless overridden
+    // by a command-line option.
+    // TODO: It would be better to limit the vectorization factor based on
+    //       data type rather than just register size. For example, x86 AVX has
+    //       256-bit registers, but it does not support integer operations
+    //       at that width (that requires AVX2).
+    if (MaxVectorRegSizeOption.getNumOccurrences())
+      MaxVecRegSize = MaxVectorRegSizeOption;
+    else
+      MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+    MinVecRegSize = MinVectorRegSizeOption;
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -426,6 +419,21 @@
   /// vectorizable tree.
   void computeMinimumValueSizes();
 
+  /// \returns maximum vector register size as set by TTI or overridden by cl::opt.
+  unsigned getMaxVecRegSize() const {
+    return MaxVecRegSize;
+  }
+
+  /// \returns minimum vector register size as set by cl::opt.
+  unsigned getMinVecRegSize() const {
+    return MinVecRegSize;
+  }
+
+  /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+  ///
+  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+  unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
 private:
   struct TreeEntry;
 
@@ -435,6 +443,10 @@
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
 
+  /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+  /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
+  bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+
   /// Vectorize a single entry in the tree.
   Value *vectorizeTree(TreeEntry *E);
 
@@ -926,6 +938,8 @@
   AssumptionCache *AC;
   DemandedBits *DB;
   const DataLayout *DL;
+  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
 
   /// Instruction builder to construct the vectorized tree.
   IRBuilder<> Builder;
@@ -1159,8 +1173,9 @@
       }
       return;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      bool Reuse = CanReuseExtract(VL);
+      bool Reuse = canReuseExtract(VL, Opcode);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
       } else {
@@ -1477,6 +1492,74 @@
   }
 }
 
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+  unsigned N;
+  Type *EltTy;
+  auto *ST = dyn_cast<StructType>(T);
+  if (ST) {
+    N = ST->getNumElements();
+    EltTy = *ST->element_begin();
+  } else {
+    N = cast<ArrayType>(T)->getNumElements();
+    EltTy = cast<ArrayType>(T)->getElementType();
+  }
+  if (!isValidElementType(EltTy))
+    return 0;
+  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+    return 0;
+  if (ST) {
+    // Check that struct is homogeneous.
+    for (const auto *Ty : ST->elements())
+      if (Ty != EltTy)
+        return 0;
+  }
+  return N;
+}
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
+  assert(Opcode == Instruction::ExtractElement ||
+         Opcode == Instruction::ExtractValue);
+  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+  // Check if all of the extracts come from the same vector and from the
+  // correct offset.
+  Value *VL0 = VL[0];
+  Instruction *E0 = cast<Instruction>(VL0);
+  Value *Vec = E0->getOperand(0);
+
+  // We have to extract from a vector/aggregate with the same number of elements.
+  unsigned NElts;
+  if (Opcode == Instruction::ExtractValue) {
+    const DataLayout &DL = E0->getModule()->getDataLayout();
+    NElts = canMapToVector(Vec->getType(), DL);
+    if (!NElts)
+      return false;
+    // Check if load can be rewritten as load of vector.
+    LoadInst *LI = dyn_cast<LoadInst>(Vec);
+    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+      return false;
+  } else {
+    NElts = Vec->getType()->getVectorNumElements();
+  }
+
+  if (NElts != VL.size())
+    return false;
+
+  // Check that all of the indices extract from the correct offset.
+  if (!matchExtractIndex(E0, 0, Opcode))
+    return false;
+
+  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+    Instruction *E = cast<Instruction>(VL[i]);
+    if (!matchExtractIndex(E, i, Opcode))
+      return false;
+    if (E->getOperand(0) != Vec)
+      return false;
+  }
+
+  return true;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value *> VL = E->Scalars;
 
@@ -1506,11 +1589,12 @@
     case Instruction::PHI: {
       return 0;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(VL)) {
+      if (canReuseExtract(VL, Opcode)) {
         int DeadCost = 0;
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-          ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+          Instruction *E = cast<Instruction>(VL[i]);
           if (E->hasOneUse())
             // Take credit for instruction that will become dead.
             DeadCost +=
@@ -2198,13 +2282,25 @@
     }
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(E->Scalars)) {
+      if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
         Value *V = VL0->getOperand(0);
         E->VectorizedValue = V;
         return V;
       }
       return Gather(E->Scalars, VecTy);
     }
+    case Instruction::ExtractValue: {
+      if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+        Builder.SetInsertPoint(LI);
+        PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+        E->VectorizedValue = V;
+        return propagateMetadata(V, E->Scalars);
+      }
+      return Gather(E->Scalars, VecTy);
+    }
     case Instruction::ZExt:
     case Instruction::SExt:
     case Instruction::FPToUI:
@@ -3418,19 +3514,6 @@
   if (!TTI->getNumberOfRegisters(true))
     return false;
 
-  // Use the vector register size specified by the target unless overridden
-  // by a command-line option.
-  // TODO: It would be better to limit the vectorization factor based on
-  //       data type rather than just register size. For example, x86 AVX has
-  //       256-bit registers, but it does not support integer operations
-  //       at that width (that requires AVX2).
-  if (MaxVectorRegSizeOption.getNumOccurrences())
-    MaxVecRegSize = MaxVectorRegSizeOption;
-  else
-    MaxVecRegSize = TTI->getRegisterBitWidth(true);
-
-  MinVecRegSize = MinVectorRegSizeOption;
-
   // Don't vectorize when the attribute NoImplicitFloat is used.
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
     return false;
@@ -3538,9 +3621,6 @@
   /// The getelementptr instructions in a basic block organized by base pointer.
   WeakVHListMap GEPs;
-
-  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
-  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
 };
 
 /// \brief Check that the Values in the slice in VL array are still existent in
@@ -3658,7 +3738,7 @@
 
   // FIXME: Is division-by-2 the correct step? Should we assert that the
   // register size is a power-of-2?
-  for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+  for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) {
     if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
       // Mark the vectorized stores so that we don't vectorize them again.
       VectorizedStores.insert(Operands.begin(), Operands.end());
@@ -3731,7 +3811,7 @@
   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
   unsigned Sz = R.getVectorElementSize(I0);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = R.getMinVecRegSize() / Sz;
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
@@ -3798,13 +3878,14 @@
       for (auto &V : BuildVectorSlice) {
         IRBuilder<true, NoFolder> Builder(InsertAfter->getParent(),
                                           ++BasicBlock::iterator(InsertAfter));
-        InsertElementInst *IE = cast<InsertElementInst>(V);
+        Instruction *I = cast<Instruction>(V);
+        assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
         Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
             VectorizedRoot, Builder.getInt32(VecIdx++)));
-        IE->setOperand(1, Extract);
-        IE->removeFromParent();
-        IE->insertAfter(Extract);
-        InsertAfter = IE;
+        I->setOperand(1, Extract);
+        I->removeFromParent();
+        I->insertAfter(Extract);
+        InsertAfter = I;
       }
     }
     // Move to the next bundle.
@@ -4201,6 +4282,25 @@
   return false;
 }
 
+/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+                               SmallVectorImpl<Value *> &BuildVector,
+                               SmallVectorImpl<Value *> &BuildVectorOpds) {
+  if (!IV->hasOneUse())
+    return false;
+  Value *V = IV->getAggregateOperand();
+  if (!isa<UndefValue>(V)) {
+    InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+    if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+      return false;
+  }
+  BuildVector.push_back(IV);
+  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+  return true;
+}
+
 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   return V->getType() < V2->getType();
 }
@@ -4357,7 +4457,7 @@
         continue;
 
       // Try to match and vectorize a horizontal reduction.
-      if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) {
+      if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
@@ -4385,7 +4485,7 @@
       if (BinaryOperator *BinOp =
              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
         if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
-                                        MinVecRegSize) ||
+                                        R.getMinVecRegSize()) ||
             tryToVectorize(BinOp, R)) {
           Changed = true;
          it = BB->begin();
@@ -4453,6 +4553,28 @@
      continue;
    }
 
+    // Try to vectorize trees that start at insertvalue instructions feeding into
+    // a store.
+    if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+      if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+        const DataLayout &DL = BB->getModule()->getDataLayout();
+        if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
+          SmallVector<Value *, 16> BuildVector;
+          SmallVector<Value *, 16> BuildVectorOpds;
+          if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
+            continue;
+
+          DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+          if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+          }
+          continue;
+        }
+      }
+    }
   }
 
   return Changed;
Index: test/Transforms/SLPVectorizer/X86/insertvalue.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/insertvalue.ll
+++ test/Transforms/SLPVectorizer/X86/insertvalue.ll
@@ -0,0 +1,189 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+; CHECK-LABEL: julia_2xdouble
+; CHECK: load <2 x double>
+; CHECK: load <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: fadd <2 x double>
+define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+top:
+  %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
+  %x0 = load double, double* %px0, align 4
+  %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
+  %y0 = load double, double* %py0, align 4
+  %m0 = fmul double %x0, %y0
+  %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
+  %x1 = load double, double* %px1, align 4
+  %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
+  %y1 = load double, double* %py1, align 4
+  %m1 = fmul double %x1, %y1
+  %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
+  %z0 = load double, double* %pz0, align 4
+  %a0 = fadd double %m0, %z0
+  %i0 = insertvalue [2 x double] undef, double %a0, 0
+  %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
+  %z1 = load double, double* %pz1, align 4
+  %a1 = fadd double %m1, %z1
+  %i1 = insertvalue [2 x double] %i0, double %a1, 1
+  store [2 x double] %i1, [2 x double]* %0, align 4
+  ret void
+}
+
+; CHECK-LABEL: julia_4xfloat
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fmul <4 x float>
+; CHECK: fadd <4 x float>
+define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+top:
+  %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
+  %x0 = load float, float* %px0, align 4
+  %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
+  %y0 = load float, float* %py0, align 4
+  %m0 = fmul float %x0, %y0
+  %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
+  %x1 = load float, float* %px1, align 4
+  %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
+  %y1 = load float, float* %py1, align 4
+  %m1 = fmul float %x1, %y1
+  %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
+  %x2 = load float, float* %px2, align 4
+  %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
+  %y2 = load float, float* %py2, align 4
+  %m2 = fmul float %x2, %y2
+  %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
+  %x3 = load float, float* %px3, align 4
+  %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
+  %y3 = load float, float* %py3, align 4
+  %m3 = fmul float %x3, %y3
+  %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
+  %z0 = load float, float* %pz0, align 4
+  %a0 = fadd float %m0, %z0
+  %i0 = insertvalue [4 x float] undef, float %a0, 0
+  %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
+  %z1 = load float, float* %pz1, align 4
+  %a1 = fadd float %m1, %z1
+  %i1 = insertvalue [4 x float] %i0, float %a1, 1
+  %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
+  %z2 = load float, float* %pz2, align 4
+  %a2 = fadd float %m2, %z2
+  %i2 = insertvalue [4 x float] %i1, float %a2, 2
+  %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
+  %z3 = load float, float* %pz3, align 4
+  %a3 = fadd float %m3, %z3
+  %i3 = insertvalue [4 x float] %i2, float %a3, 3
+  store [4 x float] %i3, [4 x float]* %0, align 4
+  ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_float
+; CHECK: fsub <4 x float>
+define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+top:
+  %a_arr = load [4 x float], [4 x float]* %a, align 4
+  %a0 = extractvalue [4 x float] %a_arr, 0
+  %a2 = extractvalue [4 x float] %a_arr, 2
+  %a1 = extractvalue [4 x float] %a_arr, 1
+  %b_arr = load [4 x float], [4 x float]* %b, align 4
+  %b0 = extractvalue [4 x float] %b_arr, 0
+  %b2 = extractvalue [4 x float] %b_arr, 2
+  %b1 = extractvalue [4 x float] %b_arr, 1
+  %a3 = extractvalue [4 x float] %a_arr, 3
+  %c1 = fsub float %a1, %b1
+  %b3 = extractvalue [4 x float] %b_arr, 3
+  %c0 = fsub float %a0, %b0
+  %c2 = fsub float %a2, %b2
+  %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
+  %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
+  %c3 = fsub float %a3, %b3
+  %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
+  %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
+  store [4 x float] %c_arr3, [4 x float]* %c, align 4
+  ret void
+}
+
+; CHECK-LABEL: julia_load_array_of_i32
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: sub <4 x i32>
+define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
+top:
+  %a_arr = load [4 x i32], [4 x i32]* %a, align 4
+  %a0 = extractvalue [4 x i32] %a_arr, 0
+  %a2 = extractvalue [4 x i32] %a_arr, 2
+  %a1 = extractvalue [4 x i32] %a_arr, 1
+  %b_arr = load [4 x i32], [4 x i32]* %b, align 4
+  %b0 = extractvalue [4 x i32] %b_arr, 0
+  %b2 = extractvalue [4 x i32] %b_arr, 2
+  %b1 = extractvalue [4 x i32] %b_arr, 1
+  %a3 = extractvalue [4 x i32] %a_arr, 3
+  %c1 = sub i32 %a1, %b1
+  %b3 = extractvalue [4 x i32] %b_arr, 3
+  %c0 = sub i32 %a0, %b0
+  %c2 = sub i32 %a2, %b2
+  %c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
+  %c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
+  %c3 = sub i32 %a3, %b3
+  %c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
+  %c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
+  store [4 x i32] %c_arr3, [4 x i32]* %c, align 4
+  ret void
+}
+
+; Almost identical to previous test, but for a type that should NOT be vectorized: a <4 x i16> is only 64 bits wide, below the minimum vector register size (128 by default).
+;
+; CHECK-LABEL: julia_load_array_of_i16
+; CHECK-NOT: i16>
+define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
+top:
+  %a_arr = load [4 x i16], [4 x i16]* %a, align 4
+  %a0 = extractvalue [4 x i16] %a_arr, 0
+  %a2 = extractvalue [4 x i16] %a_arr, 2
+  %a1 = extractvalue [4 x i16] %a_arr, 1
+  %b_arr = load [4 x i16], [4 x i16]* %b, align 4
+  %b0 = extractvalue [4 x i16] %b_arr, 0
+  %b2 = extractvalue [4 x i16] %b_arr, 2
+  %b1 = extractvalue [4 x i16] %b_arr, 1
+  %a3 = extractvalue [4 x i16] %a_arr, 3
+  %c1 = sub i16 %a1, %b1
+  %b3 = extractvalue [4 x i16] %b_arr, 3
+  %c0 = sub i16 %a0, %b0
+  %c2 = sub i16 %a2, %b2
+  %c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
+  %c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
+  %c3 = sub i16 %a3, %b3
+  %c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
+  %c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
+  store [4 x i16] %c_arr3, [4 x i16]* %c, align 4
+  ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+; CHECK-LABEL: julia_load_struct_of_float
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fsub <4 x float>
+define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+top:
+  %a_struct = load %pseudovec, %pseudovec* %a, align 4
+  %a0 = extractvalue %pseudovec %a_struct, 0
+  %a1 = extractvalue %pseudovec %a_struct, 1
+  %b_struct = load %pseudovec, %pseudovec* %b, align 4
+  %a2 = extractvalue %pseudovec %a_struct, 2
+  %b0 = extractvalue %pseudovec %b_struct, 0
+  %a3 = extractvalue %pseudovec %a_struct, 3
+  %c0 = fsub float %a0, %b0
+  %b1 = extractvalue %pseudovec %b_struct, 1
+  %b2 = extractvalue %pseudovec %b_struct, 2
+  %c1 = fsub float %a1, %b1
+  %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
+  %b3 = extractvalue %pseudovec %b_struct, 3
+  %c3 = fsub float %a3, %b3
+  %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
+  %c2 = fsub float %a2, %b2
+  %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
+  %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
+  store %pseudovec %c_struct3, %pseudovec* %c, align 4
+  ret void
+}
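
A quick illustration of the store-rooted pattern this patch recognizes (a sketch, not part of the patch or its test suite; the function name @build_aggregate_sketch and all value names are made up): a chain of single-use insertvalue instructions builds a homogeneous aggregate that feeds a store. findBuildAggregate walks the chain backwards from the store's value operand, canMapToVector checks that the aggregate type is layout-compatible with a legal vector type, and tryToVectorizeList then vectorizes the collected scalar operands.

; With the patch, the two loads and the two fadds below can be combined into
; single <2 x double> operations (subject to the cost model and the minimum
; vector register size); the insertvalue chain is then fed via extractelement.
define void @build_aggregate_sketch([2 x double]* %dst, [2 x double]* %src) {
top:
  %p0 = getelementptr inbounds [2 x double], [2 x double]* %src, i64 0, i64 0
  %p1 = getelementptr inbounds [2 x double], [2 x double]* %src, i64 0, i64 1
  %x0 = load double, double* %p0, align 8
  %x1 = load double, double* %p1, align 8
  %a0 = fadd double %x0, 1.0
  %a1 = fadd double %x1, 1.0
  %v0 = insertvalue [2 x double] undef, double %a0, 0 ; chain starts at undef
  %v1 = insertvalue [2 x double] %v0, double %a1, 1   ; last link, single use
  store [2 x double] %v1, [2 x double]* %dst, align 8 ; root found at the store
  ret void
}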