Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -246,12 +246,13 @@
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                       DominatorTree *DT, const TargetLibraryInfo *TLI,
-                      const TargetTransformInfo *TTI, unsigned VecWidth,
+                      const TargetTransformInfo *TTI,
+                      LoopVectorizationCostModel *CostModel, unsigned VecWidth,
                       unsigned UnrollFactor)
       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
-        Legal(nullptr), AddedSafetyChecks(false) {}
+        Legal(nullptr), AddedSafetyChecks(false), CM(CostModel) {}
 
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *L) {
@@ -452,14 +453,19 @@
   // Record whether runtime check is added.
   bool AddedSafetyChecks;
+
+  LoopVectorizationCostModel *CM;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                     DominatorTree *DT, const TargetLibraryInfo *TLI,
-                    const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+                    const TargetTransformInfo *TTI,
+                    LoopVectorizationCostModel *CostModel,
+                    unsigned UnrollFactor)
+      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, CostModel, 1,
+                            UnrollFactor) {}
 
 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -880,6 +886,11 @@
   /// \return information about the register usage of the loop.
   RegisterUsage calculateRegisterUsage();
 
+  /// \return A map of instructions that can use smaller types.
+  std::map<Instruction*, Type*> &getClampedInstrTys(unsigned VF) {
+    return ClampedInstrTys[VF];
+  }
+
 private:
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
@@ -887,6 +898,19 @@
   /// the factor width.
   unsigned expectedCost(unsigned VF);
 
+  /// Return the vectorized type or a clamped type that may have been
+  /// previously identified.
+  Type* getClampedVectorTy(Instruction *I, unsigned VF);
+
+  /// Expects a CastInst and returns whether it has been found to be free
+  /// once vectorized.
+  bool isFreeCast(Instruction *I, unsigned VF);
+
+  /// Search from I looking for a chain of calculations which can use
+  /// smaller types than what the IR currently presents.
+  bool isNarrowInstruction(Instruction *I, unsigned VF, Type *NarrowVecTy,
+                           unsigned ScalarWidth);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
@@ -921,6 +945,12 @@
   const Function *TheFunction;
   // Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+
+  // While searching from truncs, we store instructions which can use
+  // smaller types when vectorized.
+  std::map<unsigned, std::map<Instruction*, Type*>> ClampedInstrTys;
+  // Sext and Zext instructions that will disappear due to vectorization.
+  std::map<unsigned, SmallPtrSet<Instruction*, 4>> FreeCasts;
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1414,11 +1444,11 @@
       // We decided not to vectorize, but we may want to unroll.
-      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
+      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, &CM, UF);
       Unroller.vectorize(&LVL);
     } else {
       // If we decided that it is *legal* to vectorize the loop then do it.
-      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
+      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, &CM, VF.Width, UF);
       LB.vectorize(&LVL);
       ++LoopsVectorized;
@@ -1858,7 +1888,7 @@
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   Value *UndefVec = IsVoidRetTy ? nullptr :
-  UndefValue::get(VectorType::get(Instr->getType(), VF));
+    UndefValue::get(VectorType::get(Instr->getType(), VF));
 
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
@@ -2553,6 +2583,73 @@
   return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
 }
 
+// Returns the new instruction that is a clamped version of 'I', if it is a
+// binary operation. Otherwise it just returns I.
+Instruction*
+visitClampedInstr(LoopVectorizationCostModel *CM, Instruction *I, Type *Ty,
+                  unsigned VF) {
+  static unsigned insertNum = 0;
+  auto &ClampedVecTys = CM->getClampedInstrTys(VF);
+
+  // Recurse through the chain.
+  for (auto II = I->op_begin(), IE = I->op_end(); II != IE; ++II) {
+    if (Instruction *Operand = dyn_cast<Instruction>(*II)) {
+      if (ClampedVecTys.count(Operand)) {
+        Type *Ty = ClampedVecTys[Operand]->getScalarType();
+        Instruction *Clamped = visitClampedInstr(CM, Operand, Ty, VF);
+        // Replace the operand with the new value if it is different.
+        if (Clamped != Operand)
+          *II = (Value*)Clamped;
+      }
+    }
+  }
+
+  // By inserting truncs and extends, the binary operations in the chain can
+  // be performed on smaller types. These extra CastInsts should hopefully
+  // then cancel out the existing casts that extend for loads and truncate
+  // for stores.
+  if (I->isBinaryOp()) {
+    IRBuilder<> builder(I);
+    Value *Op0 = I->getOperand(0);
+    Value *Op1 = I->getOperand(1);
+    Value *T0 = nullptr;
+    Value *T1 = nullptr;
+    if (I->getOperand(0)->getType() != Ty)
+      T0 = CastInst::CreateTruncOrBitCast(Op0, Ty, Twine(insertNum++), I);
+    else
+      T0 = Op0;
+    if (I->getOperand(1)->getType() != Ty)
+      T1 = CastInst::CreateTruncOrBitCast(Op1, Ty, Twine(insertNum++), I);
+    else
+      T1 = Op1;
+
+    Value *Bin = BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
+                                        T0, T1, Twine(insertNum++), I);
+    return CastInst::CreateIntegerCast(Bin, I->getType(), false,
+                                       Twine(insertNum++), I);
+
+  }
+  return I;
+}
+
+// Iterate through the map of clamped instruction types for the given
+// vectorization factor, then perform the clamping, starting from the
+// trunc instructions.
+void adjustForClampedInstrs(LoopVectorizationCostModel *CM, unsigned VF) {
+  auto &ClampedVecTys = CM->getClampedInstrTys(VF);
+  for (auto CV : ClampedVecTys) {
+    Instruction *I = CV.first;
+    if (isa<TruncInst>(I)) {
+      Type *Ty = CV.second->getScalarType();
+      Instruction *NewVal = visitClampedInstr(CM, I, Ty, VF);
+      auto OldVal = I->op_begin();
+      // Replace with the new value if it is different.
+      if (I != NewVal)
+        *OldVal = NewVal;
+    }
+  }
+}
+
 void InnerLoopVectorizer::vectorizeLoop() {
   //===------------------------------------------------===//
   //
@@ -2578,6 +2675,10 @@
   LoopBlocksDFS DFS(OrigLoop);
   DFS.perform(LI);
 
+  // The CostModel may have identified operations that could be executed
+  // using smaller types, so convert those operations to use them.
+  adjustForClampedInstrs(CM, VF);
+
   // Vectorize all of the blocks in the original loop.
   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO();
        bb != be; ++bb)
@@ -4045,6 +4146,7 @@
 }
 
 unsigned LoopVectorizationCostModel::getWidestType() {
+
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
 
@@ -4365,23 +4467,28 @@
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   unsigned Cost = 0;
 
+  ClampedInstrTys.insert(std::make_pair(VF, std::map<Instruction*, Type*>()));
+
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
     unsigned BlockCost = 0;
     BasicBlock *BB = *bb;
 
-    // For each instruction in the old loop.
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    // For each instruction in the old loop. The iteration is reversed so
+    // that the search can begin from trunc instructions, letting us find
+    // free casts and smaller types for operations.
+    for (BasicBlock::reverse_iterator it = BB->rbegin(), e = BB->rend();
+         it != e; ++it) {
       // Skip dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(it))
+      if (isa<DbgInfoIntrinsic>(*it))
        continue;
 
       // Ignore ephemeral values.
-      if (EphValues.count(it))
+      if (EphValues.count(&*it))
        continue;
 
-      unsigned C = getInstructionCost(it, VF);
+      unsigned C = getInstructionCost(&*it, VF);
 
       // Check if we should override the cost.
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -4463,6 +4570,95 @@
   return false;
 }
 
+// The search for free casts and smaller types begins from truncs towards the
+// possible extend instructions and, during the search, the extend
+// instructions are saved.
+bool LoopVectorizationCostModel::isFreeCast(Instruction *I, unsigned VF) {
+  return FreeCasts[VF].count(I);
+}
+
+bool
+LoopVectorizationCostModel::isNarrowInstruction(Instruction *I,
+                                                unsigned VF,
+                                                Type *NarrowVecTy,
+                                                unsigned ScalarWidth) {
+  if (isa<PHINode>(I))
+    return false;
+
+  if (ClampedInstrTys[VF].count(I))
+    return true;
+
+  // We only care about integer operations.
+  Type *DstTy = I->getType();
+  if (!DstTy->isIntegerTy())
+    return false;
+
+  const APInt MaxScalarValue = APInt::getMaxValue(ScalarWidth);
+  SmallVector<Instruction*, 4> PossibleFreeCasts;
+  unsigned NarrowOperands = 0;
+
+  // Search the operands of the instruction and look for sext/zext or
+  // constants within the size limit.
+  for (Value *Opr : I->operands()) {
+    if (auto *ConstInt = dyn_cast<ConstantInt>(Opr)) {
+      if (ScalarWidth == ConstInt->getValue().getBitWidth())
+        if (MaxScalarValue.ult(ConstInt->getValue()))
+          break;
+
+      ++NarrowOperands;
+
+    } else if (auto *Cast = dyn_cast<CastInst>(Opr)) {
+      // We only care about integer operations.
+      if (!Cast->isIntegerCast())
+        break;
+
+      ++NarrowOperands;
+
+      // Check whether the original size of the sext/zext is the same
+      // as the trunc destination size.
+      if (Cast->getSrcTy() == NarrowVecTy->getScalarType())
+        PossibleFreeCasts.push_back(Cast);
+      else {
+        // If not, we still save the clamped size so later the
+        // zext/sext instruction knows what size is necessary.
+        ClampedInstrTys[VF].insert(std::make_pair(Cast, NarrowVecTy));
+      }
+    } else if (auto *NextInstruction = dyn_cast<Instruction>(Opr)) {
+      unsigned NextOpc = NextInstruction->getOpcode();
+      if (NextOpc == Instruction::Mul ||
+          NextOpc == Instruction::Add ||
+          NextOpc == Instruction::And ||
+          NextOpc == Instruction::Or ||
+          NextOpc == Instruction::Xor ||
+          NextOpc == Instruction::Shl) {
+        if (isNarrowInstruction(NextInstruction, VF, NarrowVecTy, ScalarWidth))
+          ++NarrowOperands;
+      } else
+        break;
+    } else
+      break;
+  }
+  // If all the operands to this instruction are narrow, make the trunc
+  // and the casts free and assign a clamped VectorTy to the ChainInst.
+  if (NarrowOperands == I->getNumOperands()) {
+    for (auto *FreeCast : PossibleFreeCasts)
+      FreeCasts[VF].insert(FreeCast);
+    ClampedInstrTys[VF].insert(std::make_pair(I, NarrowVecTy));
+    DEBUG(dbgs() << "LV: Found narrow value for " << I->getName() << "\n");
+    return true;
+  }
+
+  return false;
+}
+
+Type*
+LoopVectorizationCostModel::getClampedVectorTy(Instruction *I, unsigned VF) {
+  if (ClampedInstrTys[VF].count(I))
+    return ClampedInstrTys[VF][I];
+  else
+    return ToVectorTy(I->getType(), VF);
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -4471,7 +4667,7 @@
     VF = 1;
 
   Type *RetTy = I->getType();
-  Type *VectorTy = ToVectorTy(RetTy, VF);
+  Type *VectorTy = getClampedVectorTy(I, VF);
 
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -4630,15 +4826,32 @@
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    Type *SrcVecTy = ToVectorTy(SrcTy, VF);
+    unsigned Opcode = I->getOpcode();
     // We optimize the truncation of induction variable.
     // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
+    if (Opcode == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+      return TTI.getCastInstrCost(Opcode, I->getType(), SrcTy);
+    else if (Opcode == Instruction::Trunc) {
+      // First, check whether the truncation destination size would be a legal
+      // vector type.
+      if (TTI.isTypeLegal(VectorTy)) {
+        const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+        unsigned ScalarWidth = DL.getTypeSizeInBits(RetTy);
+
+        if (isNarrowInstruction(cast<Instruction>(I->getOperand(0)), VF,
+                                VectorTy, ScalarWidth)) {
+          ClampedInstrTys[VF].insert(std::make_pair(I, VectorTy));
+          return 0;
+        }
+      }
+    }
+    else if (isFreeCast(I, VF))
+      return 0;
 
-    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -4749,7 +4962,7 @@
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   Value *UndefVec = IsVoidRetTy ? nullptr :
-    UndefValue::get(Instr->getType());
+    UndefValue::get(VectorType::get(Instr->getType(), VF));
 
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,199 @@
+; RUN: opt -S < %s -basicaa -loop-vectorize -simplifycfg -instsimplify -instcombine -licm 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: load <16 x i8>, <16 x i8>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: add <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i8
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: load <8 x i16>, <8 x i16>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <8 x i16>
+; CHECK: add <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp9 = icmp sgt i32 %len, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv8 = zext i16 %0 to i32
+  %add = add nuw nsw i32 %conv8, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_d(
+; COST: cost of 2 for instruction: {{.*}} load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; COST: cost of 2 for instruction: {{.*}} store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_e(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>
+; CHECK: load <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} shl <16 x i8>
+; CHECK: shl <16 x i8>
+; CHECK: shl <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: add <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} or <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: or <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} mul <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: mul <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} and <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: and <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} xor <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: xor <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} mul <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: mul <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+  %cmp.32 = icmp sgt i32 %len, 0
+  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv11 = zext i8 %arg2 to i32
+  %conv13 = zext i8 %arg1 to i32
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = shl nuw nsw i32 %conv, 4
+  %conv2 = add nuw nsw i32 %add, 32
+  %or = or i32 %conv, 51
+  %mul = mul nuw nsw i32 %or, 60
+  %and = and i32 %conv2, %conv13
+  %mul.masked = and i32 %mul, 252
+  %conv17 = xor i32 %mul.masked, %conv11
+  %mul18 = mul nuw nsw i32 %conv17, %and
+  %conv19 = trunc i32 %mul18 to i8
+  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv19, i8* %arrayidx21
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
+
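The note below is illustrative commentary, not part of the patch. It sketches the narrowing that the clamped types are meant to achieve for a chain like the one in @add_a, using an assumed VF of 4 so the splat constants stay short (the test above checks VF 16). The value names %p.vec, %q.vec, %wide, %ext, %nar are hypothetical and are not produced verbatim by this patch.

; Naive widening keeps the whole chain in the extended type:
  %wide = load <4 x i8>, <4 x i8>* %p.vec
  %ext = zext <4 x i8> %wide to <4 x i32>
  %add = add nuw nsw <4 x i32> %ext, <i32 2, i32 2, i32 2, i32 2>
  %nar = trunc <4 x i32> %add to <4 x i8>
  store <4 x i8> %nar, <4 x i8>* %q.vec

; With the add clamped to the trunc's element type, the zext/trunc pair can
; be folded away (the instcombine run in the RUN line cleans this up) and the
; chain stays in the narrow type:
  %wide = load <4 x i8>, <4 x i8>* %p.vec
  %add = add <4 x i8> %wide, <i8 2, i8 2, i8 2, i8 2>
  store <4 x i8> %add, <4 x i8>* %q.vec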