Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -83,6 +83,7 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/BranchProbability.h"
@@ -887,6 +888,12 @@
   /// the factor width.
   unsigned expectedCost(unsigned VF);
 
+  // Return the vectorized type or a clamped type that may have been
+  // previously identified.
+  Type* getClampedVectorTy(Instruction *I, unsigned VF);
+
+  bool isFreeCast(Instruction *I, unsigned VF);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
@@ -921,6 +928,12 @@
   const Function *TheFunction;
   // Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+
+  // While searching from truncs, we store instructions which can use
+  // smaller types when vectorized.
+  ValueMap<Instruction *, Type *> ClampedInstrTys;
+  // Sext and Zext instructions that will disappear due to vectorization.
+  SmallPtrSet<Instruction *, 8> FreeCasts;
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -4364,7 +4377,6 @@
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   unsigned Cost = 0;
 
-  // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
@@ -4372,16 +4384,17 @@
     BasicBlock *BB = *bb;
 
     // For each instruction in the old loop.
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    for (BasicBlock::reverse_iterator it = BB->rbegin(), e = BB->rend();
+         it != e; ++it) {
       // Skip dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(it))
+      if (isa<DbgInfoIntrinsic>(*it))
        continue;
 
       // Ignore ephemeral values.
-      if (EphValues.count(it))
+      if (EphValues.count(&*it))
        continue;
 
-      unsigned C = getInstructionCost(it, VF);
+      unsigned C = getInstructionCost(&*it, VF);
 
       // Check if we should override the cost.
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -4401,6 +4414,10 @@
     Cost += BlockCost;
   }
 
+  // Clear the structures for this VF.
+  ClampedInstrTys.clear();
+  FreeCasts.clear();
+
   return Cost;
 }
 
@@ -4463,6 +4480,88 @@
   return false;
 }
 
+bool
+LoopVectorizationCostModel::isFreeCast(Instruction *I, unsigned VF) {
+  CastInst *CI = cast<CastInst>(I);
+  Type *DstTy = CI->getDestTy();
+  Type *DstVecTy = ToVectorTy(DstTy, VF);
+
+  // The search starts from truncs towards the possible extend instructions,
+  // so these would have been found earlier.
+  if (I->getOpcode() != Instruction::Trunc) {
+    if (FreeCasts.count(I))
+      return true;
+    else
+      return false;
+  }
+
+  // Should only be used to check integer operations.
+  if (!DstTy->isIntegerTy())
+    return false;
+
+  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+  unsigned ScalarWidth = DL.getTypeSizeInBits(DstTy);
+
+  // First, check whether the truncation destination size would be a legal
+  // vector type.
+  if (TTI.isTypeLegal(DstVecTy)) {
+    auto *ChainInst = cast<Instruction>(I->getOperand(0));
+    SmallVector<Instruction *, 2> PossibleFreeCasts;
+    unsigned NarrowOperands = 0;
+
+    // Then search the operands of the trunc use and look for sext/zext or
+    // constants within the size limit.
+    for (unsigned i = 0; i < ChainInst->getNumOperands(); ++i) {
+
+      if (isa<ConstantInt>(ChainInst->getOperand(i))) {
+        auto *ConstInt = cast<ConstantInt>(ChainInst->getOperand(i));
+        const APInt MaxScalarValue = APInt::getMaxValue(ScalarWidth);
+
+        if (ScalarWidth == ConstInt->getValue().getBitWidth())
+          if (MaxScalarValue.ult(ConstInt->getValue()))
+            break;
+
+        ++NarrowOperands;
+
+      } else if (isa<CastInst>(ChainInst->getOperand(i))) {
+        auto *Cast = cast<CastInst>(ChainInst->getOperand(i));
+
+        if (!Cast->isIntegerCast())
+          break;
+
+        ++NarrowOperands;
+
+        // Check whether the original size of the sext/zext is the same
+        // as the trunc destination size.
+        if (Cast->getSrcTy() == DstTy)
+          PossibleFreeCasts.push_back(Cast);
+        else
+          // If not, we still save the clamped size so later the
+          // zext/sext instruction knows what size is necessary.
+          ClampedInstrTys.insert(std::make_pair(Cast, DstVecTy));
+      }
+    }
+    // If all the operands to this instruction are narrow, make the trunc
+    // and the casts free and assign a clamped VectorTy to the ChainInst.
+    if (NarrowOperands == ChainInst->getNumOperands()) {
+      for (auto *I : PossibleFreeCasts)
+        FreeCasts.insert(I);
+      ClampedInstrTys.insert(std::make_pair(ChainInst, DstVecTy));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+Type*
+LoopVectorizationCostModel::getClampedVectorTy(Instruction *I, unsigned VF) {
+  if (ClampedInstrTys.find(I) != ClampedInstrTys.end())
+    return ClampedInstrTys[I];
+  else
+    return ToVectorTy(I->getType(), VF);
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -4471,7 +4570,7 @@
     VF = 1;
 
   Type *RetTy = I->getType();
-  Type *VectorTy = ToVectorTy(RetTy, VF);
+  Type *VectorTy = getClampedVectorTy(I, VF);
 
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -4630,15 +4729,19 @@
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    Type *SrcVecTy = ToVectorTy(SrcTy, VF);
+    unsigned Opcode = I->getOpcode();
     // We optimize the truncation of induction variable.
     // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
+    if (Opcode == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+      return TTI.getCastInstrCost(Opcode, I->getType(), SrcTy);
 
-    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    if (isFreeCast(I, VF))
+      return 0;
+    else
+      return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,133 @@
+; RUN: opt -S < %s -loop-vectorize -simplifycfg -instsimplify -instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: load <16 x i8>, <16 x i8>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: add <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i8
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: load <8 x i16>, <8 x i16>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <8 x i16>
+; CHECK: add <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) {
+entry:
+  %cmp9 = icmp sgt i32 %len, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv8 = zext i16 %0 to i32
+  %add = add nuw nsw i32 %conv8, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nounwind
+; CHECK-LABEL: @add_d(
+; COST: cost of 2 for instruction: {{.*}} load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; COST: cost of 2 for instruction: {{.*}} store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}