Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -263,12 +263,13 @@
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                       DominatorTree *DT, const TargetLibraryInfo *TLI,
-                      const TargetTransformInfo *TTI, unsigned VecWidth,
+                      const TargetTransformInfo *TTI,
+                      LoopVectorizationCostModel *CostModel, unsigned VecWidth,
                       unsigned UnrollFactor)
       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
-        Legal(nullptr), AddedSafetyChecks(false) {}
+        Legal(nullptr), AddedSafetyChecks(false), CM(CostModel) {}

   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *L) {
@@ -328,7 +329,8 @@
   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

   /// A helper function to vectorize a single BB within the innermost loop.
-  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
+  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV,
+                            LoopVectorizationCostModel *CM, unsigned VF);

   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
@@ -472,14 +474,19 @@
   // Record whether runtime check is added.
   bool AddedSafetyChecks;
+
+  LoopVectorizationCostModel *CM;
 };

 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                     DominatorTree *DT, const TargetLibraryInfo *TLI,
-                    const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+                    const TargetTransformInfo *TTI,
+                    LoopVectorizationCostModel *CostModel,
+                    unsigned UnrollFactor)
+      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, CostModel, 1,
+                            UnrollFactor) {}

 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -1136,6 +1143,11 @@
   /// \return information about the register usage of the loop.
   RegisterUsage calculateRegisterUsage();

+  /// \return A map of instructions that can use smaller types.
+  std::map<Instruction *, Type *> &getNarrowInstrs(unsigned VF) {
+    return NarrowInstrs[VF];
+  }
+
 private:
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
@@ -1143,6 +1155,34 @@
   /// the factor width.
   unsigned expectedCost(unsigned VF);

+  /// Return the vectorized type, or a clamped type that may have been
+  /// identified previously.
+  Type *getClampedVectorTy(Instruction *I, unsigned VF);
+
+  /// Expects a CastInst and returns the destination type, which may have been
+  /// recalculated to be smaller than before. The cast is free if the function
+  /// returns nullptr.
+  Type *getAdjustedCastType(Instruction *I, unsigned VF);
+
+  /// Create a pair from I and NarrowTy and insert it into the NarrowInstrs
+  /// map for VF. If an entry already exists for I, the larger of the two
+  /// types is kept.
+  void InsertConfirmedNarrow(Instruction *I, Type *NarrowTy, unsigned VF);
+
+  /// Create a pair from I and NarrowTy and insert it into the
+  /// CandidateNarrowInstrs map for VF. If an entry already exists for I, the
+  /// larger of the two types is kept. Returns false if
+  /// VectorType::get(NarrowTy, VF) is not a legal type.
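+  /// Candidates recorded in CandidateNarrowInstrs are only promoted into
+  /// NarrowInstrs once ConfirmNarrowChain has validated the whole chain.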
+  bool TryInsertCandidateNarrow(Instruction *I, Type *NarrowTy, unsigned VF);
+
+  /// Adds I to the candidate map with a smaller type if that is all it needs.
+  /// The candidate is confirmed later.
+  void MapNarrowInstruction(Instruction *I, unsigned VF);
+
+  /// Use the CandidateNarrowInstrs map to confirm that chains of instructions
+  /// can use smaller types.
+  bool ConfirmNarrowChain(Instruction *I, unsigned VF, Type *NarrowTy);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
@@ -1177,6 +1217,11 @@
   const Function *TheFunction;
   // Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+
+  // While searching from truncs, we store instructions which can use
+  // smaller types when vectorized.
+  std::map<unsigned, std::map<Instruction *, Type *>> NarrowInstrs;
+  std::map<unsigned, std::map<Instruction *, Type *>> CandidateNarrowInstrs;
 };

 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1671,11 +1716,11 @@
                      Twine("interleaved by " + Twine(IC) +
                            " (vectorization not beneficial)"));

-      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
+      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, &CM, IC);
       Unroller.vectorize(&LVL);
     } else {
       // If we decided that it is *legal* to vectorize the loop then do it.
-      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
+      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, &CM, VF.Width, IC);
       LB.vectorize(&LVL);
       ++LoopsVectorized;

@@ -2339,7 +2384,7 @@
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   Value *UndefVec = IsVoidRetTy ? nullptr :
-    UndefValue::get(VectorType::get(Instr->getType(), VF));
+    UndefValue::get(VectorType::get(Instr->getType(), VF));

   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
@@ -3048,8 +3093,9 @@
   // Vectorize all of the blocks in the original loop.
   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
-       be = DFS.endRPO(); bb != be; ++bb)
-    vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
+       be = DFS.endRPO(); bb != be; ++bb) {
+    vectorizeBlockInLoop(*bb, &RdxPHIsToFix, CM, VF);
+  }

   // At this point every instruction in the original loop is widened to
   // a vector form. We are almost done. Now, we need to fix the PHI nodes
@@ -3455,7 +3501,10 @@
   }
 }

-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
+void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV,
+                                               LoopVectorizationCostModel *CM,
+                                               unsigned VF) {
+  auto &ClampedVecTys = CM->getNarrowInstrs(VF);
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     VectorParts &Entry = WidenMap.get(it);
@@ -3496,11 +3545,32 @@
       // Use this vector value for all users of the original instruction.
       for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+        Value *APart = A[Part];
+        Value *BPart = B[Part];
+
+        // The CostModel may have identified operations that could be executed
+        // using smaller types, so convert those operations to use them. We do
+        // this by truncating the operands of binary operations that are part
+        // of a chain of operations that can use smaller types.
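+        // For example, with VF = 16 a scalar chain of the form
+        // 'zext i8 -> i32; add i32; trunc i32 -> i8' (as in the add_a test
+        // below) can perform the add directly on <16 x i8>, with both casts
+        // becoming free.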
+        if (ClampedVecTys.count(it)) {
+          assert(it->getType()->isIntegerTy() &&
+                 "Should not be clamping floats!");
+
+          Type *Ty = ClampedVecTys[it]->getScalarType();
+          VectorType *VecTy = VectorType::get(Ty, VF);
+          APart = Builder.CreateTrunc(APart, VecTy);
+          BPart = Builder.CreateTrunc(BPart, VecTy);
+        }
+
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), APart, BPart);
         if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
           VecOp->copyIRFlags(BinOp);
+
+        // If the instruction has had its inputs clamped, we need to then
+        // extend the value back to its original size.
+        if (ClampedVecTys.count(it))
+          V = Builder.CreateZExt(V, VectorType::get(it->getType(), VF));
+
         Entry[Part] = V;
       }

@@ -4888,23 +4958,41 @@
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   unsigned Cost = 0;

+  NarrowInstrs.insert(std::make_pair(VF, std::map<Instruction *, Type *>()));
+
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
     unsigned BlockCost = 0;
     BasicBlock *BB = *bb;

-    // For each instruction in the old loop.
+    // For each instruction in the old loop, scan across them to identify
+    // instructions that could use narrower types.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
       // Skip dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(it))
+      if (isa<DbgInfoIntrinsic>(*it))
         continue;

       // Ignore ephemeral values.
-      if (EphValues.count(it))
+      if (EphValues.count(&*it))
         continue;

-      unsigned C = getInstructionCost(it, VF);
+      MapNarrowInstruction(it, VF);
+    }
+
+    // For each instruction in the old loop. Iterate in reverse order so that
+    // narrow-typed instruction chains can be confirmed by searching bottom-up
+    // from trunc instructions.
+    for (auto it = BB->rbegin(), e = BB->rend(); it != e; ++it) {
+      // Skip dbg intrinsics.
+      if (isa<DbgInfoIntrinsic>(*it))
+        continue;
+
+      // Ignore ephemeral values.
+      if (EphValues.count(&*it))
+        continue;
+
+      unsigned C = getInstructionCost(&*it, VF);

       // Check if we should override the cost.
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -4986,6 +5074,246 @@
   return false;
 }

+// The search for free casts and smaller types begins at truncs and moves
+// towards the possible extend instructions; the extend instructions found
+// during the search are saved.
+Type *LoopVectorizationCostModel::getAdjustedCastType(Instruction *I,
+                                                      unsigned VF) {
+  if (NarrowInstrs[VF].count(I)) {
+    Type *NarrowTy = nullptr;
+    if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      if (TruncInst *TI = dyn_cast<TruncInst>(I))
+        NarrowTy = TI->getDestTy();
+      else
+        NarrowTy = CI->getSrcTy();
+
+      if (NarrowTy == NarrowInstrs[VF][I]) {
+        DEBUG(dbgs() << "LV: This cast is free: " << CI->getName() << "\n");
+        return nullptr;
+      } else
+        return NarrowInstrs[VF][I];
+    }
+  }
+  return I->getType();
+}
+
+Type *getLargestType(Type *T0, Type *T1) {
+  if (T0->isIntegerTy()) {
+    if (T1->isIntegerTy())
+      return T0->getIntegerBitWidth() > T1->getIntegerBitWidth() ? T0 : T1;
+    else
+      return T0;
+  } else if (T1->isIntegerTy())
+    return T1;
+
+  llvm_unreachable("one type should be integer!");
+  return T0;
+}
+
+bool LoopVectorizationCostModel::TryInsertCandidateNarrow(Instruction *I,
+                                                          Type *NarrowTy,
+                                                          unsigned VF) {
+  // We do not allow illegal types to be inserted.
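+  // For example, with VF = 16 an i8 NarrowTy gives <16 x i8>, which is legal
+  // on AArch64 NEON, whereas an i16 would give <16 x i16>, which is wider
+  // than a 128-bit vector register and so is rejected here.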
+  if (TTI.isTypeLegal(VectorType::get(NarrowTy, VF))) {
+    if (CandidateNarrowInstrs[VF].count(I)) {
+      Type *Largest = getLargestType(CandidateNarrowInstrs[VF][I], NarrowTy);
+      if (Largest == NarrowTy) {
+        CandidateNarrowInstrs[VF].erase(I);
+        CandidateNarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+      }
+    } else
+      CandidateNarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+    return true;
+  }
+  return false;
+}
+
+void LoopVectorizationCostModel::InsertConfirmedNarrow(Instruction *I,
+                                                       Type *NarrowTy,
+                                                       unsigned VF) {
+  if (NarrowInstrs[VF].count(I)) {
+    Type *Largest = getLargestType(NarrowInstrs[VF][I], NarrowTy);
+    if (Largest == NarrowTy) {
+      NarrowInstrs[VF].erase(I);
+      NarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+    }
+  } else
+    NarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+}
+
+Type *
+LoopVectorizationCostModel::getClampedVectorTy(Instruction *I, unsigned VF) {
+  if (NarrowInstrs[VF].count(I))
+    return ToVectorTy(NarrowInstrs[VF][I], VF);
+  else
+    return ToVectorTy(I->getType(), VF);
+}
+
+bool LoopVectorizationCostModel::ConfirmNarrowChain(Instruction *I,
+                                                    unsigned VF,
+                                                    Type *NarrowTy) {
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+  // If a narrow type has already been suggested for this instruction, update
+  // NarrowTy to be the larger of these types.
+  if (CandidateNarrowInstrs[VF].count(I))
+    NarrowTy = getLargestType(NarrowTy, CandidateNarrowInstrs[VF][I]);
+
+  // Extend instructions now use NarrowTy; this was either deduced in the
+  // first top-down phase, or it has now been recalculated from the value
+  // required by the extend's single user.
+  if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    if (!isa<TruncInst>(CI)) {
+      InsertConfirmedNarrow(CI, NarrowTy, VF);
+      return true;
+    }
+    return false;
+  }
+
+  // Right shifts can be allowed if the size reduction does not prevent the
+  // necessary high bits from being calculated. This is true when the shift
+  // amount is less than or equal to the width of the suggested narrow type;
+  // that narrow type is then doubled in size so as not to lose the calculated
+  // high bits. For example, a right shift by 8 with a suggested type of i8 is
+  // instead evaluated as i16.
+  if (I->getOpcode() == Instruction::LShr ||
+      I->getOpcode() == Instruction::AShr) {
+    unsigned ShiftVal = cast<ConstantInt>(I->getOperand(1))->getZExtValue();
+    if (ShiftVal <= 8 && NarrowTy == Type::getInt8Ty(Context))
+      NarrowTy = Type::getInt16Ty(Context);
+    else if (ShiftVal <= 16 && I->getType() == Type::getInt64Ty(Context) &&
+             NarrowTy == Type::getInt16Ty(Context))
+      NarrowTy = Type::getInt32Ty(Context);
+    else
+      return false;
+  }
+
+  // To confirm instruction I, for each operand Opr:
+  //  - if it is already confirmed, continue.
+  //  - if it is a candidate, perform confirmation of Opr.
+  //  - if it is a cast, the type may be adjusted and needs to be saved.
+  //  - if it is a constant, continue.
+  //  - if it is none of the above, this isn't a narrow instruction.
+  unsigned NumConfirmed = 0;
+  for (Value *Opr : I->operands()) {
+    if (Instruction *NextOp = dyn_cast<Instruction>(Opr)) {
+      // If we find an already confirmed operand, grab its value.
+      if (NarrowInstrs[VF].count(NextOp))
+        ++NumConfirmed;
+      else if (CandidateNarrowInstrs[VF].count(NextOp)) {
+        // Need to confirm the type of this operand.
+        if (ConfirmNarrowChain(NextOp, VF, NarrowTy))
+          ++NumConfirmed;
+        else
+          break;
+      } else if (isa<CastInst>(NextOp)) {
+        // Visit any new CastInsts using this narrow type.
+        if (ConfirmNarrowChain(NextOp, VF, NarrowTy))
+          ++NumConfirmed;
+      }
+    } else if (isa<ConstantInt>(Opr))
+      // Constants do not have to be re-checked; this was done when the
+      // candidate was first mapped. NarrowTy may have changed since, but it
+      // only grows.
+      ++NumConfirmed;
+  }
+
+  if (I->getNumOperands() != NumConfirmed) {
+    CandidateNarrowInstrs[VF].erase(I);
+    return false;
+  } else {
+    // If a value already exists for I, the larger type will be kept.
+    InsertConfirmedNarrow(I, NarrowTy, VF);
+    return true;
+  }
+}
+
+void LoopVectorizationCostModel::MapNarrowInstruction(Instruction *I,
+                                                      unsigned VF) {
+  if (isa<PHINode>(I))
+    return;
+
+  // We only care about integer operations.
+  Type *DstTy = I->getType();
+  if (!DstTy->isIntegerTy())
+    return;
+
+  // The following opcodes have been selected because they calculate the same
+  // values even with truncated types: they do not require the high bits that
+  // will ultimately be removed by the final truncation. This is not true for
+  // the right shift operations, but these can be included if the shift value
+  // is an immediate equal to or less than half of the bit width of the
+  // resulting type. The extend and trunc ops are also included as they bound
+  // the chains of operations.
+  unsigned Opc = I->getOpcode();
+  if (Opc != Instruction::Mul &&
+      Opc != Instruction::Add &&
+      Opc != Instruction::Sub &&
+      Opc != Instruction::And &&
+      Opc != Instruction::Or &&
+      Opc != Instruction::Xor &&
+      Opc != Instruction::Shl &&
+      Opc != Instruction::LShr &&
+      Opc != Instruction::AShr)
+    return;
+
+  // We can only analyse and validate right shift operations later if the
+  // shift value is an immediate.
+  if (Opc == Instruction::LShr ||
+      Opc == Instruction::AShr)
+    if (!isa<ConstantInt>(I->getOperand(1)))
+      return;
+
+  const APInt i8MaxValue = APInt::getMaxValue(8);
+  const APInt i16MaxValue = APInt::getMaxValue(16);
+  const APInt i32MaxValue = APInt::getMaxValue(32);
+  SmallVector<Type *, 2> NarrowOpTys;
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+
+  // Search the operands of the instruction and look for operands that have
+  // already been added, constants within the size limit, or sext/zext.
+  for (Value *Opr : I->operands()) {
+    if (Instruction *NextOp = dyn_cast<Instruction>(Opr)) {
+      // If the operand is an instruction, it needs to have already been added
+      // to CandidateNarrowInstrs, or to be a CastInst that may not be in the
+      // loop body.
+      if (CandidateNarrowInstrs[VF].count(NextOp))
+        NarrowOpTys.push_back(CandidateNarrowInstrs[VF][NextOp]);
+      else if (auto *CI = dyn_cast<CastInst>(NextOp)) {
+        if (auto *TI = dyn_cast<TruncInst>(I)) {
+          if (TryInsertCandidateNarrow(TI, TI->getDestTy(), VF))
+            NarrowOpTys.push_back(TI->getDestTy());
+        } else if (TryInsertCandidateNarrow(CI, CI->getSrcTy(), VF))
+          NarrowOpTys.push_back(CI->getSrcTy());
+      } else
+        return;
+    } else if (auto *ConstInt = dyn_cast<ConstantInt>(Opr)) {
+      // If the operand is a constant, calculate the smallest type that it can
+      // be. It isn't counted as a narrow operand if it is larger than 16 bits,
+      // unless the original value is an i64.
+      unsigned BitWidth = ConstInt->getValue().getBitWidth();
+      const APInt ConstVal = ConstInt->getValue();
+      APInt MaskValue = ConstVal & (i8MaxValue.zextOrSelf(BitWidth));
+      if (ConstVal.eq(MaskValue))
+        NarrowOpTys.push_back(Type::getInt8Ty(Context));
+      else if (ConstVal.eq(ConstVal & (i16MaxValue.zextOrSelf(BitWidth))))
+        NarrowOpTys.push_back(Type::getInt16Ty(Context));
+      else if (I->getType() == Type::getInt64Ty(Context) &&
+               ConstVal.eq(ConstVal & (i32MaxValue.zextOrSelf(BitWidth))))
+        NarrowOpTys.push_back(Type::getInt32Ty(Context));
+      else
+        return;
+    }
+  }
+
+  if (NarrowOpTys.size() == I->getNumOperands()) {
+    // If a narrow type was found for every operand, map the new narrow type
+    // for I: the shared type if both operands agree, or the larger of the two
+    // if they differ in size.
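+    // For example, an add whose operands were mapped as i8 and i16 is
+    // recorded as an i16 candidate.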
+    if (NarrowOpTys[0] == NarrowOpTys[1]) {
+      TryInsertCandidateNarrow(I, NarrowOpTys[0], VF);
+    } else {
+      Type *LargerType = getLargestType(NarrowOpTys[0], NarrowOpTys[1]);
+      TryInsertCandidateNarrow(I, LargerType, VF);
+    }
+  }
+}
+
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                         unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -4994,7 +5322,7 @@
     VF = 1;

   Type *RetTy = I->getType();
-  Type *VectorTy = ToVectorTy(RetTy, VF);
+  Type *VectorTy = getClampedVectorTy(I, VF);

   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -5193,15 +5521,38 @@
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    Type *SrcVecTy = ToVectorTy(SrcTy, VF);
+    unsigned Opcode = I->getOpcode();
     // We optimize the truncation of induction variable.
     // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
+    if (Opcode == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+      return TTI.getCastInstrCost(Opcode, I->getType(), SrcTy);
+    else if (Opcode == Instruction::Trunc) {
+      // First, check whether the truncation destination size would be a legal
+      // vector type.
+      if (TTI.isTypeLegal(VectorTy)) {
+        Instruction *ChainOp = cast<Instruction>(I->getOperand(0));
+        if (ConfirmNarrowChain(ChainOp, VF, RetTy)) {
+          DEBUG(dbgs() << "LV: Found a chain of narrow instructions\n");
+          Type *NarrowTy = NarrowInstrs[VF][ChainOp];
+          InsertConfirmedNarrow(I, NarrowTy, VF);
+        }
+      }
+    }

+    // If AdjustedType is nullptr, it is a free cast; otherwise use the
+    // adjusted type for either the source or the destination.
+    Type *AdjustedType = getAdjustedCastType(I, VF);
+    if (AdjustedType == nullptr)
+      return 0;
+    else if (Instruction::Trunc == Opcode)
+      SrcVecTy = ToVectorTy(AdjustedType, VF);
+    else if (Instruction::ZExt == Opcode ||
+             Instruction::SExt == Opcode)
+      VectorTy = ToVectorTy(AdjustedType, VF);

-    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -5312,7 +5663,7 @@
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   Value *UndefVec = IsVoidRetTy ? nullptr :
-  UndefValue::get(Instr->getType());
+  UndefValue::get(VectorType::get(Instr->getType(), VF));

   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,298 @@
+; RUN: opt -S < %s -basicaa -loop-vectorize -simplifycfg -instsimplify -instcombine -licm 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; COST: cost of 2 {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: load <16 x i8>, <16 x i8>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i8
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: load <8 x i16>, <8 x i16>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: add nuw nsw <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp9 = icmp sgt i32 %len, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv8 = zext i16 %0 to i32
+  %add = add nuw nsw i32 %conv8, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_d(
+; COST: cost of 2 for instruction: {{.*}} load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; COST: cost of 2 for instruction: {{.*}} store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_e(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>
+; CHECK: load <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} shl <16 x i8>
+; CHECK: shl nuw nsw <16 x i8>
+; CHECK: shl nuw nsw <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} or <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: or <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} mul <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} and <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: and <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} xor <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: xor <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} mul <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+  %cmp.32 = icmp sgt i32 %len, 0
+  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv11 = zext i8 %arg2 to i32
+  %conv13 = zext i8 %arg1 to i32
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = shl nuw nsw i32 %conv, 4
+  %conv2 = add nuw nsw i32 %add, 32
+  %or = or i32 %conv, 51
+  %mul = mul nuw nsw i32 %or, 60
+  %and = and i32 %conv2, %conv13
+  %mul.masked = and i32 %mul, 252
+  %conv17 = xor i32 %mul.masked, %conv11
+  %mul18 = mul nuw nsw i32 %conv17, %and
+  %conv19 = trunc i32 %mul18 to i8
+  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv19, i8* %arrayidx21
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_f(
+; COST: cost of 2 for instruction: {{.*}} load <8 x i16>
+; CHECK: load <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} shl <8 x i16>
+; CHECK: shl nsw <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <8 x i16>
+; CHECK: add nsw <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} and <8 x i16>
+; CHECK: and <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} or <8 x i16>
+; CHECK: or <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} mul <8 x i16>
+; CHECK: mul nuw nsw <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} and <8 x i16>
+; CHECK: and <8 x i16>
+; CHECK: and <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} xor <8 x i16>
+; CHECK: xor <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} mul <8 x i16>
+; CHECK: mul nuw nsw <8 x i16>
+; COST: cost of 28 for instruction: {{.*}} trunc <8 x i16>
+; CHECK: trunc <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <8 x i8>
+; CHECK: store <8 x i8>
+define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+  %cmp.32 = icmp sgt i32 %len, 0
+  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv11 = zext i8 %arg2 to i32
+  %conv13 = zext i8 %arg1 to i32
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv = sext i16 %0 to i32
+  %add = shl nsw i32 %conv, 4
+  %conv2 = add nsw i32 %add, 32
+  %or = and i32 %conv, 204
+  %conv8 = or i32 %or, 51
+  %mul = mul nuw nsw i32 %conv8, 60
+  %and = and i32 %conv2, %conv13
+  %mul.masked = and i32 %mul, 252
+  %conv17 = xor i32 %mul.masked, %conv11
+  %mul18 = mul nuw nsw i32 %conv17, %and
+  %conv19 = trunc i32 %mul18 to i8
+  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv19, i8* %arrayidx21
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_g(
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: shl nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: lshr <4 x i32>
+define void @add_g(i16* noalias nocapture readonly %p, i16* noalias nocapture readonly %q, i16* noalias nocapture %r, i16 %arg1, i32 %len) #0 {
+  %1 = icmp sgt i32 %len, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = sext i16 %arg1 to i64
+  br label %3
+
+._crit_edge:                                      ; preds = %3, %0
+  ret void
+
+;