Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -317,6 +317,10 @@
   return nullptr;
 }
 
+// FIXME: The following helper functions have multiple implementations
+// in the project. They can be effectively organized in a common Load/Store
+// utilities unit.
+
 /// A helper function that returns the pointer operand of a load or store
 /// instruction.
 static Value *getPointerOperand(Value *I) {
@@ -327,6 +331,34 @@
   return nullptr;
 }
 
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         "Expected Load or Store instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return LI->getType();
+  return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns the alignment of load or store instruction.
+static unsigned getMemInstAlignment(Value *I) {
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         "Expected Load or Store instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return LI->getAlignment();
+  return cast<StoreInst>(I)->getAlignment();
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// load or store instruction.
+static unsigned getMemInstAddressSpace(Value *I) {
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         "Expected Load or Store instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerAddressSpace();
+  return cast<StoreInst>(I)->getPointerAddressSpace();
+}
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type at the given vectorization factor.
@@ -1612,12 +1644,6 @@
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
 
-  /// Returns true if \p I is known to be uniform after vectorization.
-  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
-
-  /// Returns true if \p I is known to be scalar after vectorization.
-  bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
-
   /// Returns the information that we collected about runtime memory check.
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return LAI->getRuntimePointerChecking();
@@ -1694,15 +1720,9 @@
   /// instructions that may divide by zero.
   bool isScalarWithPredication(Instruction *I);
 
-  /// Returns true if \p I is a memory instruction that has a consecutive or
-  /// consecutive-like pointer operand. Consecutive-like pointers are pointers
-  /// that are treated like consecutive pointers during vectorization. The
-  /// pointer operands of interleaved accesses are an example.
-  bool hasConsecutiveLikePtrOperand(Instruction *I);
-
-  /// Returns true if \p I is a memory instruction that must be scalarized
-  /// during vectorization.
-  bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
+  /// Returns true if \p I is a memory instruction with consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
 private:
   /// Check if a single basic block loop is vectorizable.
@@ -1720,24 +1740,6 @@
   /// transformation.
   bool canVectorizeWithIfConvert();
 
-  /// Collect the instructions that are uniform after vectorization. An
-  /// instruction is uniform if we represent it with a single scalar value in
-  /// the vectorized loop corresponding to each vector iteration. Examples of
-  /// uniform instructions include pointer operands of consecutive or
-  /// interleaved memory accesses. Note that although uniformity implies an
-  /// instruction will be scalar, the reverse is not true. In general, a
-  /// scalarized instruction will be represented by VF scalar values in the
-  /// vectorized loop, each corresponding to an iteration of the original
-  /// scalar loop.
-  void collectLoopUniforms();
-
-  /// Collect the instructions that are scalar after vectorization. An
-  /// instruction is scalar if it is known to be uniform or will be scalarized
-  /// during vectorization. Non-uniform scalarized instructions will be
-  /// represented by VF values in the vectorized loop, each corresponding to an
-  /// iteration of the original scalar loop.
-  void collectLoopScalars();
-
   /// Return true if all of the instructions in the block can be speculatively
   /// executed. \p SafePtrs is a list of addresses that are known to be legal
   /// and we know that we can read from them without segfault.
@@ -1827,12 +1829,6 @@
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value *, 4> AllowedExit;
 
-  /// Holds the instructions known to be uniform after vectorization.
-  SmallPtrSet<Instruction *, 4> Uniforms;
-
-  /// Holds the instructions known to be scalar after vectorization.
-  SmallPtrSet<Instruction *, 4> Scalars;
-
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
 
@@ -1889,6 +1885,15 @@
   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                                  unsigned LoopCost);
 
+  /// A memory access instruction may be vectorized in more than one way.
+  /// The form an instruction takes after vectorization depends on its cost.
+  /// This function takes cost-based decisions for Load/Store instructions
+  /// and collects them in a map. This decision map is used for building
+  /// the lists of loop-uniform and loop-scalar instructions.
+  /// The calculated cost is saved with the widening decision in order to
+  /// avoid redundant calculations.
+  void setCostBasedWideningDecision(unsigned VF);
+
   /// \brief A struct that represents some properties of the register usage
   /// of a loop.
   struct RegisterUsage {
@@ -1923,11 +1928,84 @@
     return Scalars->second.count(I);
   }
 
+  /// Returns true if \p I is known to be uniform after vectorization.
+  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+    auto UniformsPerVF = Uniforms.find(VF);
+    return UniformsPerVF->second.count(I);
+  }
+
+  /// Returns true if \p I is known to be scalar after vectorization.
+  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+    auto ScalarsPerVF = Scalars.find(VF);
+    return ScalarsPerVF->second.count(I);
+  }
+
   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
   /// for vectorization factor \p VF.
   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
     return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
-           !Legal->isScalarAfterVectorization(I);
+           !isScalarAfterVectorization(I, VF);
+  }
+
+  /// Decision that was taken during cost calculation for memory instruction.
+  enum InstWidening {
+    CM_Unknown,
+    CM_Widen,
+    CM_Interleave,
+    CM_GatherScatter,
+    CM_Scalarize
+  };
+
+  /// Save vectorization decision \p W and \p Cost taken by the cost model for
+  /// instruction \p I and vector width \p VF.
+  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
+                           unsigned Cost) {
+    assert(VF >= 2 && "Expected VF >=2");
+    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+  }
+
+  /// Save vectorization decision \p W and \p Cost taken by the cost model for
+  /// interleaving group \p Grp and vector width \p VF.
+  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+                           InstWidening W, unsigned Cost) {
+    assert(VF >= 2 && "Expected VF >=2");
+    /// Broadcast this decision to all instructions inside the group.
+    /// But the cost will be assigned to one instruction only.
+    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+      if (auto *I = Grp->getMember(i)) {
+        if (Grp->getInsertPos() == I)
+          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+        else
+          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+      }
+    }
+  }
+
+  /// Return the cost model decision for the given instruction \p I and vector
+  /// width \p VF. Return CM_Unknown if this instruction did not pass
+  /// through the cost modeling.
+  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >=2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    auto Itr = WideningDecisions.find(InstOnVF);
+    if (Itr == WideningDecisions.end())
+      return CM_Unknown;
+    return Itr->second.first;
+  }
+
+  /// Return the vectorization cost for the given instruction \p I and vector
+  /// width \p VF.
+  unsigned getWideningCost(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >=2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
+    return WideningDecisions[InstOnVF].second;
   }
 
 private:
@@ -1954,6 +2032,26 @@
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
 
+  /// Calculate vectorization cost of memory instruction \p I.
+  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for scalarized memory instruction.
+  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for interleaving group of memory instructions.
+  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for Gather/Scatter instruction.
+  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for widening instruction \p I with consecutive
+  /// memory access.
+  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+
+  /// The cost calculation for Load instruction \p I with uniform pointer -
+  /// scalar load + broadcast.
+  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1983,6 +2081,14 @@
   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
 
+  /// Holds the instructions known to be uniform after vectorization.
+  /// The data is collected per VF.
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+  /// Holds the instructions known to be scalar after vectorization.
+  /// The data is collected per VF.
+ DenseMap> Scalars; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -1995,6 +2101,44 @@ /// the loop. void collectInstsToScalarize(unsigned VF); + /// Collect the instructions that are uniform after vectorization. An + /// instruction is uniform if we represent it with a single scalar value in + /// the vectorized loop corresponding to each vector iteration. Examples of + /// uniform instructions include pointer operands of consecutive or + /// interleaved memory accesses. Note that although uniformity implies an + /// instruction will be scalar, the reverse is not true. In general, a + /// scalarized instruction will be represented by VF scalar values in the + /// vectorized loop, each corresponding to an iteration of the original + /// scalar loop. + void collectLoopUniforms(unsigned VF); + + /// Collect the instructions that are scalar after vectorization. An + /// instruction is scalar if it is known to be uniform or will be scalarized + /// during vectorization. Non-uniform scalarized instructions will be + /// represented by VF values in the vectorized loop, each corresponding to an + /// iteration of the original scalar loop. + void collectLoopScalars(unsigned VF); + + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. + if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); + } + + /// Keeps cost model vectorization decision and cost for instructions. + /// Right now it is used for memory instructions only. + typedef DenseMap, + std::pair> + DecisionList; + + DecisionList WideningDecisions; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -2228,7 +2372,7 @@ } bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { - return Legal->isScalarAfterVectorization(I) || + return Cost->isScalarAfterVectorization(I, VF) || Cost->isProfitableToScalarize(I, VF); } @@ -2404,7 +2548,7 @@ // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Legal->isUniformAfterVectorization(cast(EntryVal)) ? 1 : VF; + Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. ScalarParts Entry(UF); @@ -2472,7 +2616,7 @@ // known to be uniform after vectorization, this corresponds to lane zero // of the last unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the last unroll iteration. - unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1; + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; auto *LastInst = cast(getScalarValue(V, UF - 1, LastLane)); // Set the insert point after the last scalarized instruction. This ensures @@ -2489,7 +2633,7 @@ // VectorLoopValueMap, we will only generate the insertelements once. 
for (unsigned Part = 0; Part < UF; ++Part) { Value *VectorValue = nullptr; - if (Legal->isUniformAfterVectorization(I)) { + if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); } else { VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); @@ -2518,8 +2662,9 @@ if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast(V)) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? + !Cost->isUniformAfterVectorization(cast(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested @@ -2590,15 +2735,13 @@ if (Instr != Group->getInsertPos()) return; - LoadInst *LI = dyn_cast(Instr); - StoreInst *SI = dyn_cast(Instr); Value *Ptr = getPointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. - Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr)); // Prepare for the new pointers. setDebugLocFromInst(Builder, Ptr); @@ -2638,7 +2781,7 @@ Value *UndefVec = UndefValue::get(VecTy); // Vectorize the interleaved load group. - if (LI) { + if (isa(Instr)) { // For each unroll part, create a wide load for the group. SmallVector NewLoads; @@ -2723,33 +2866,34 @@ assert((LI || SI) && "Invalid Load/Store instruction"); - // Try to vectorize the interleave group if this access is interleaved. - if (Legal->isAccessInterleaved(Instr)) + LoopVectorizationCostModel::InstWidening Decision = + Cost->getWideningDecision(Instr, VF); + assert(Decision != LoopVectorizationCostModel::CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == LoopVectorizationCostModel::CM_Interleave) return vectorizeInterleaveGroup(Instr); - Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getPointerOperand(Instr); - unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned Alignment = getMemInstAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); if (!Alignment) Alignment = DL.getABITypeAlignment(ScalarDataTy); - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + unsigned AddressSpace = getMemInstAddressSpace(Instr); // Scalarize the memory instruction if necessary. - if (Legal->memoryInstructionMustBeScalarized(Instr, VF)) + if (Decision == LoopVectorizationCostModel::CM_Scalarize) return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - - // Determine if either a gather or scatter operation is legal. 
bool CreateGatherScatter = - !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr); + (Decision == LoopVectorizationCostModel::CM_GatherScatter); VectorParts VectorGep; @@ -2934,7 +3078,7 @@ // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { @@ -4516,7 +4660,7 @@ // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; // These are the scalar results. Notice that we don't generate vector GEPs // because scalar GEPs result in better code. ScalarParts Entry(UF); @@ -5020,12 +5164,6 @@ if (UseInterleaved) InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); - // Collect all instructions that are known to be uniform after vectorization. - collectLoopUniforms(); - - // Collect all instructions that are known to be scalar after vectorization. - collectLoopScalars(); - unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; @@ -5291,10 +5429,18 @@ return true; } -void LoopVectorizationLegality::collectLoopScalars() { +void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { + + // We should not collect Scalars more than once per VF. Right now, + // this function is called from collectUniformsAndScalars(), which + // already does this check. Collecting Scalars for VF=1 does not make any + // sense. + + assert(VF >= 2 && !Scalars.count(VF) && + "This function should not be visited twice for the same VF"); // If an instruction is uniform after vectorization, it will remain scalar. - Scalars.insert(Uniforms.begin(), Uniforms.end()); + Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); // Collect the getelementptr instructions that will not be vectorized. A // getelementptr instruction is only vectorized if it is used for a legal @@ -5302,21 +5448,21 @@ for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (auto *GEP = dyn_cast(&I)) { - Scalars.insert(GEP); + Scalars[VF].insert(GEP); continue; } auto *Ptr = getPointerOperand(&I); if (!Ptr) continue; auto *GEP = getGEPInstruction(Ptr); - if (GEP && isLegalGatherOrScatter(&I)) - Scalars.erase(GEP); + if (GEP && getWideningDecision(&I, VF) == CM_GatherScatter) + Scalars[VF].erase(GEP); } // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : *getInductionVars()) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); @@ -5324,7 +5470,7 @@ // vectorization. 
auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast(U); - return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Scalars[VF].count(I); }); if (!ScalarInd) continue; @@ -5333,25 +5479,17 @@ // scalar after vectorization. auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast(U); - return I == Ind || !TheLoop->contains(I) || Scalars.count(I); + return I == Ind || !TheLoop->contains(I) || Scalars[VF].count(I); }); if (!ScalarIndUpdate) continue; // The induction variable and its update instruction will remain scalar. - Scalars.insert(Ind); - Scalars.insert(IndUpdate); + Scalars[VF].insert(Ind); + Scalars[VF].insert(IndUpdate); } } -bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) { - if (isAccessInterleaved(I)) - return true; - if (auto *Ptr = getPointerOperand(I)) - return isConsecutivePtr(Ptr); - return false; -} - bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { if (!blockNeedsPredication(I->getParent())) return false; @@ -5369,48 +5507,48 @@ return false; } -bool LoopVectorizationLegality::memoryInstructionMustBeScalarized( - Instruction *I, unsigned VF) { - - // If the memory instruction is in an interleaved group, it will be - // vectorized and its pointer will remain uniform. - if (isAccessInterleaved(I)) - return false; - +bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, + unsigned VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast(I); StoreInst *SI = dyn_cast(I); assert((LI || SI) && "Invalid memory instruction"); - // If the pointer operand is uniform (loop invariant), the memory instruction - // will be scalarized. auto *Ptr = getPointerOperand(I); - if (LI && isUniform(Ptr)) - return true; - // If the pointer operand is non-consecutive and neither a gather nor a - // scatter operation is legal, the memory instruction will be scalarized. - if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I)) - return true; + // In order to be widened, the pointer should be consecutive, first of all. + if (!isConsecutivePtr(Ptr)) + return false; // If the instruction is a store located in a predicated block, it will be // scalarized. if (isScalarWithPredication(I)) - return true; + return false; // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); if (hasIrregularType(ScalarTy, DL, VF)) - return true; + return false; - // Otherwise, the memory instruction should be vectorized if the rest of the - // loop is. - return false; + return true; } -void LoopVectorizationLegality::collectLoopUniforms() { +void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { + + // We should not collect Uniforms more than once per VF. Right now, + // this function is called from collectUniformsAndScalars(), which + // already does this check. Collecting Uniforms for VF=1 does not make any + // sense. + + assert(VF >= 2 && !Uniforms.count(VF) && + "This function should not be visited twice for the same VF"); + + // Visit the list of Uniforms. If we'll not find any uniform value, we'll + // not analyze again. Uniforms.count(VF) will return 1. + Uniforms[VF].clear(); + // We now know that the loop is vectorizable! 
// Collect instructions inside the loop that will remain uniform after // vectorization. @@ -5443,6 +5581,14 @@ // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet PossibleNonUniformPtrs; + auto isUniformDecision = [&](Instruction *I, unsigned VF) { + InstWidening WideningDecision = getWideningDecision(I, VF); + assert(WideningDecision != CM_Unknown && + "Widening decision should be ready at this moment"); + + return (WideningDecision == CM_Widen || + WideningDecision == CM_Interleave); + }; // Iterate over the instructions in the loop, and collect all // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible // that a consecutive-like pointer operand will be scalarized, we collect it @@ -5465,25 +5611,18 @@ return getPointerOperand(U) == Ptr; }); - // Ensure the memory instruction will not be scalarized, making its - // pointer operand non-uniform. If the pointer operand is used by some - // instruction other than a memory access, we're not going to check if - // that other instruction may be scalarized here. Thus, conservatively - // assume the pointer operand may be non-uniform. - if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I)) + // Ensure the memory instruction will not be scalarized or used by + // gather/scatter, making its pointer operand non-uniform. If the pointer + // operand is used by any instruction other than a memory access, we + // conservatively assume the pointer operand may be non-uniform. + if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) PossibleNonUniformPtrs.insert(Ptr); // If the memory instruction will be vectorized and its pointer operand - // is consecutive-like, the pointer operand should remain uniform. - else if (hasConsecutiveLikePtrOperand(&I)) - ConsecutiveLikePtrs.insert(Ptr); - - // Otherwise, if the memory instruction will be vectorized and its - // pointer operand is non-consecutive-like, the memory instruction should - // be a gather or scatter operation. Its pointer operand will be - // non-uniform. + // is consecutive-like, or interleaving - the pointer operand should + // remain uniform. else - PossibleNonUniformPtrs.insert(Ptr); + ConsecutiveLikePtrs.insert(Ptr); } // Add to the Worklist all consecutive and consecutive-like pointers that @@ -5518,7 +5657,7 @@ // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I); + return getPointerOperand(I) == Ptr && isUniformDecision(I, VF); }; // For an instruction to be added into Worklist above, all its users inside @@ -5527,7 +5666,7 @@ // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. 
- for (auto &Induction : Inductions) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); @@ -5558,7 +5697,7 @@ DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); } - Uniforms.insert(Worklist.begin(), Worklist.end()); + Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::canVectorizeMemory() { @@ -5698,7 +5837,7 @@ uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); // An alignment of 0 means target ABI alignment. - unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned Align = getMemInstAlignment(&I); if (!Align) Align = DL.getABITypeAlignment(PtrTy->getElementType()); @@ -5940,8 +6079,7 @@ "last group member potentially pointer-wrapping.\n"); releaseGroup(Group); } - } - else { + } else { // Case 3: A non-reversed interleaved load group with gaps: We need // to execute at least one scalar epilogue iteration. This will ensure // we don't speculatively access memory out-of-bounds. We only need @@ -6071,6 +6209,8 @@ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; + + collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); return Factor; } @@ -6437,12 +6577,13 @@ MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); continue; } - + collectUniformsAndScalars(VFs[j]); // Count the number of live intervals. unsigned RegUsage = 0; for (auto Inst : OpenIntervals) { // Skip ignored values for VF > 1. - if (VecValuesToIgnore.count(Inst)) + if (VecValuesToIgnore.count(Inst) || + isScalarAfterVectorization(Inst, VFs[j])) continue; RegUsage += GetRegUsage(Inst->getType(), VFs[j]); } @@ -6511,7 +6652,7 @@ Instruction *PredInst, DenseMap &ScalarCosts, unsigned VF) { - assert(!Legal->isUniformAfterVectorization(PredInst) && + assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the @@ -6532,7 +6673,7 @@ // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || - Legal->isScalarAfterVectorization(I)) + isScalarAfterVectorization(I, VF)) return false; // If the instruction is scalar with predication, it will be analyzed @@ -6552,7 +6693,7 @@ // the lane zero values for uniforms rather than asserting. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (Legal->isUniformAfterVectorization(J)) + if (isUniformAfterVectorization(J, VF)) return false; // Otherwise, we can scalarize the instruction. @@ -6565,7 +6706,7 @@ // and their return values are inserted into vectors. Thus, an extract would // still be required. auto needsExtract = [&](Instruction *I) -> bool { - return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I); + return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF); }; // Compute the expected cost discount from scalarizing the entire expression @@ -6628,6 +6769,9 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect Uniform and Scalar instructions after vectorization with VF. + collectUniformsAndScalars(VF); + // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
collectInstsToScalarize(VF); @@ -6707,11 +6851,141 @@ Legal->hasStride(I->getOperand(1)); } +unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + auto SE = PSE.getSE(); + + unsigned Alignment = getMemInstAlignment(I); + unsigned AS = getMemInstAddressSpace(I); + Value *Ptr = getPointerOperand(I); + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + + // Figure out whether the access is strided and get the stride value + // if it's known in compile time + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); + + // Get the cost of the scalar memory instruction and address computation. + unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + + Cost += VF * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS); + + // Get the overhead of the extractelement and insertelement instructions + // we might create due to scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // If we have a predicated store, it may not be executed for each vector + // lane. Scale the cost by the probability of executing the predicated + // block. + if (Legal->isScalarWithPredication(I)) + Cost /= getReciprocalPredBlockProb(); + + return Cost; +} + +unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = getMemInstAlignment(I); + Value *Ptr = getPointerOperand(I); + unsigned AS = getMemInstAddressSpace(I); + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + + assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && + "Stride should be 1 or -1 for consecutive memory access"); + unsigned Cost = 0; + if (Legal->isMaskRequired(I)) + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + else + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + + bool Reverse = ConsecutiveStride < 0; + if (Reverse) + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; +} + +unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, + unsigned VF) { + LoadInst *LI = cast(I); + Type *ValTy = LI->getType(); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = LI->getAlignment(); + unsigned AS = LI->getPointerAddressSpace(); + + return TTI.getAddressComputationCost(ValTy) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); +} + +unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = getMemInstAlignment(I); + Value *Ptr = getPointerOperand(I); + + return TTI.getAddressComputationCost(VectorTy) + + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment); +} + +unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned AS = getMemInstAddressSpace(I); + + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + unsigned InterleaveFactor = Group->getFactor(); + Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + + // Holds the indices of existing members in an interleaved load 
group.
+  // An interleaved store group doesn't need this as it doesn't allow gaps.
+  SmallVector<unsigned, 4> Indices;
+  if (isa<LoadInst>(I)) {
+    for (unsigned i = 0; i < InterleaveFactor; i++)
+      if (Group->getMember(i))
+        Indices.push_back(i);
+  }
+
+  // Calculate the cost of the whole interleaved group.
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                                 Group->getFactor(), Indices,
+                                                 Group->getAlignment(), AS);
+
+  if (Group->isReverse())
+    Cost += Group->getNumMembers() *
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+                                                              unsigned VF) {
+
+  // Calculate scalar cost only. Vectorization cost should be ready at this
+  // moment.
+  if (VF == 1) {
+    Type *ValTy = getMemInstValueType(I);
+    unsigned Alignment = getMemInstAlignment(I);
+    unsigned AS = getMemInstAddressSpace(I);
+
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS);
+  }
+  return getWideningCost(I, VF);
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
-  if (Legal->isUniformAfterVectorization(I))
+  if (isUniformAfterVectorization(I, VF))
     VF = 1;
 
   if (VF > 1 && isProfitableToScalarize(I, VF))
@@ -6725,6 +6999,79 @@
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+  if (VF == 1)
+    return;
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    // For each instruction in the old loop.
+    for (Instruction &I : *BB) {
+      Value *Ptr = getPointerOperand(&I);
+      if (!Ptr)
+        continue;
+
+      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+        // Scalar load + broadcast
+        unsigned Cost = getUniformMemOpCost(&I, VF);
+        setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        continue;
+      }
+
+      // We assume that widening is the best solution when possible.
+      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+        setWideningDecision(&I, VF, CM_Widen, Cost);
+        continue;
+      }
+
+      // Choose between Interleaving, Gather/Scatter or Scalarization.
+      unsigned InterleaveCost = UINT_MAX;
+      unsigned NumAccesses = 1;
+      if (Legal->isAccessInterleaved(&I)) {
+        auto Group = Legal->getInterleavedAccessGroup(&I);
+        assert(Group && "Fail to get an interleaved access group.");
+
+        // Make one decision for the whole group.
+        if (getWideningDecision(&I, VF) != CM_Unknown)
+          continue;
+
+        NumAccesses = Group->getNumMembers();
+        InterleaveCost = getInterleaveGroupCost(&I, VF);
+      }
+
+      unsigned GatherScatterCost =
+          Legal->isLegalGatherOrScatter(&I)
+              ? getGatherScatterCost(&I, VF) * NumAccesses
+              : UINT_MAX;
+
+      unsigned ScalarizationCost =
+          getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+      // Choose the better solution for the current VF,
+      // write down this decision and use it during vectorization.
+      unsigned Cost;
+      InstWidening Decision;
+      if (InterleaveCost <= GatherScatterCost &&
+          InterleaveCost < ScalarizationCost) {
+        Decision = CM_Interleave;
+        Cost = InterleaveCost;
+      } else if (GatherScatterCost < ScalarizationCost) {
+        Decision = CM_GatherScatter;
+        Cost = GatherScatterCost;
+      } else {
+        Decision = CM_Scalarize;
+        Cost = ScalarizationCost;
+      }
+      // If the instruction belongs to an interleave group, the whole group
+      // receives the same decision.
The whole group receives the cost, but + // the cost will actually be assigned to one instruction. + if (auto Group = Legal->getInterleavedAccessGroup(&I)) + setWideningDecision(Group, VF, Decision, Cost); + else + setWideningDecision(&I, VF, Decision, Cost); + } + } +} + unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy) { @@ -6857,126 +7204,8 @@ } case Instruction::Store: case Instruction::Load: { - StoreInst *SI = dyn_cast(I); - LoadInst *LI = dyn_cast(I); - Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType()); - VectorTy = ToVectorTy(ValTy, VF); - - unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); - unsigned AS = - SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); - Value *Ptr = getPointerOperand(I); - // We add the cost of address computation here instead of with the gep - // instruction because only here we know whether the operation is - // scalarized. - if (VF == 1) - return TTI.getAddressComputationCost(VectorTy) + - TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - - if (LI && Legal->isUniform(Ptr)) { - // Scalar load + broadcast - unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType()); - Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); - return Cost + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy); - } - - // For an interleaved access, calculate the total cost of the whole - // interleave group. - if (Legal->isAccessInterleaved(I)) { - auto Group = Legal->getInterleavedAccessGroup(I); - assert(Group && "Fail to get an interleaved access group."); - - // Only calculate the cost once at the insert position. - if (Group->getInsertPos() != I) - return 0; - - unsigned InterleaveFactor = Group->getFactor(); - Type *WideVecTy = - VectorType::get(VectorTy->getVectorElementType(), - VectorTy->getVectorNumElements() * InterleaveFactor); - - // Holds the indices of existing members in an interleaved load group. - // An interleaved store group doesn't need this as it doesn't allow gaps. - SmallVector Indices; - if (LI) { - for (unsigned i = 0; i < InterleaveFactor; i++) - if (Group->getMember(i)) - Indices.push_back(i); - } - - // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) - Cost += - Group->getNumMembers() * - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - - // FIXME: The interleaved load group with a huge gap could be even more - // expensive than scalar operations. Then we could ignore such group and - // use scalar operations instead. - return Cost; - } - - // Check if the memory instruction will be scalarized. - if (Legal->memoryInstructionMustBeScalarized(I, VF)) { - unsigned Cost = 0; - Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - - // Figure out whether the access is strided and get the stride value - // if it's known in compile time - const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); - - // Get the cost of the scalar memory instruction and address computation. - Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); - Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); - - // Get the overhead of the extractelement and insertelement instructions - // we might create due to scalarization. 
-      Cost += getScalarizationOverhead(I, VF, TTI);
-
-      // If we have a predicated store, it may not be executed for each vector
-      // lane. Scale the cost by the probability of executing the predicated
-      // block.
-      if (Legal->isScalarWithPredication(I))
-        Cost /= getReciprocalPredBlockProb();
-
-      return Cost;
-    }
-
-    // Determine if the pointer operand of the access is either consecutive or
-    // reverse consecutive.
-    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
-    bool Reverse = ConsecutiveStride < 0;
-
-    // Determine if either a gather or scatter operation is legal.
-    bool UseGatherOrScatter =
-        !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
-
-    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
-    if (UseGatherOrScatter) {
-      assert(ConsecutiveStride == 0 &&
-             "Gather/Scatter are not used for consecutive stride");
-      return Cost +
-             TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
-                                        Legal->isMaskRequired(I), Alignment);
-    }
-    // Wide load/stores.
-    if (Legal->isMaskRequired(I))
-      Cost +=
-          TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-    else
-      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
-    if (Reverse)
-      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
-    return Cost;
+    VectorTy = ToVectorTy(getMemInstValueType(I), VF);
+    return getMemoryInstructionCost(I, VF);
   }
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -7079,19 +7308,6 @@
     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
-
-  // Insert values known to be scalar into VecValuesToIgnore. This is a
-  // conservative estimation of the values that will later be scalarized.
-  //
-  // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
-  //        still be scalarized. For example, we may find an instruction to be
-  //        more profitable for a given vectorization factor if it were to be
-  //        scalarized. But at this point, we haven't yet computed the
-  //        vectorization factor.
-  for (auto *BB : TheLoop->getBlocks())
-    for (auto &I : *BB)
-      if (Legal->isScalarAfterVectorization(&I))
-        VecValuesToIgnore.insert(&I);
 }
 
 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test shows an extremely high interleaving cost that should probably be fixed.
+; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
+; the load instructions.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i8, i8* %tmp0, align 1
+  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -18,8 +18,9 @@
 ; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 ; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
 ; CHECK: vector.body:
+; CHECK: %index = phi i64
 ; CHECK: %vec.ind = phi <16 x i64>
-; CHECK: %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
+; CHECK: %[[T0:.+]] = mul i64 %index, 5
 ; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
 ; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
 ; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the "gather" operation is chosen since its cost is better
+; than the interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+;
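
A note on the bookkeeping the patch introduces: the cost model records one (decision, cost) entry per (instruction, VF) pair, and for an interleave group every member receives the group's decision while the aggregate cost is attached only to the group's insert position, so summing per-instruction costs later never double-counts the group (this is what the "cost of 20 / cost of 0" checks in the AArch64 test reflect). The following standalone C++ sketch is not part of the patch; it only illustrates that behavior, modeling instructions as integer IDs and using std::map in place of llvm::DenseMap. All names in it are hypothetical.

// Standalone illustration of the per-(instruction, VF) widening-decision map.
#include <cassert>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

enum class InstWidening { CM_Unknown, CM_Widen, CM_Interleave,
                          CM_GatherScatter, CM_Scalarize };

using InstId = int;                               // stand-in for Instruction *
using Key = std::pair<InstId, unsigned>;          // (instruction, VF)
static std::map<Key, std::pair<InstWidening, unsigned>> WideningDecisions;

// Record one decision for every member of an interleave group; the aggregate
// cost goes to the insert position only, the other members get cost 0.
void setGroupDecision(const std::vector<InstId> &Members, InstId InsertPos,
                      unsigned VF, InstWidening W, unsigned Cost) {
  for (InstId I : Members)
    WideningDecisions[{I, VF}] = {W, I == InsertPos ? Cost : 0u};
}

InstWidening getDecision(InstId I, unsigned VF) {
  auto It = WideningDecisions.find({I, VF});
  return It == WideningDecisions.end() ? InstWidening::CM_Unknown
                                       : It->second.first;
}

unsigned getCost(InstId I, unsigned VF) {
  auto It = WideningDecisions.find({I, VF});
  assert(It != WideningDecisions.end() && "cost not calculated");
  return It->second.second;
}

int main() {
  // A two-member interleave group (like the two i8 loads in the AArch64 test)
  // at VF = 2 with an aggregate group cost of 20.
  setGroupDecision({1, 2}, /*InsertPos=*/1, /*VF=*/2,
                   InstWidening::CM_Interleave, /*Cost=*/20);
  std::cout << getCost(1, 2) << " + " << getCost(2, 2) << "\n"; // prints 20 + 0
  return 0;
}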