Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -887,6 +887,12 @@
   /// the factor width.
   unsigned expectedCost(unsigned VF);
 
+  // Return the vectorized type or a clamped type that may have been
+  // previously identified.
+  Type* getClampedVectorTy(Instruction *I, unsigned VF);
+
+  bool isFreeCast(Instruction *I, unsigned VF);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
@@ -921,6 +927,12 @@
   const Function *TheFunction;
   // Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+
+  // While searching from truncs, we store instructions which can use
+  // smaller types when vectorized.
+  std::map<Instruction *, Type *> ClampedInstrTys;
+  // Sext and Zext instructions that will disappear due to vectorization.
+  std::set<Instruction *> FreeCasts;
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -4045,6 +4057,7 @@
 }
 
 unsigned LoopVectorizationCostModel::getWidestType() {
+
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
 
@@ -4372,16 +4385,17 @@
     BasicBlock *BB = *bb;
 
     // For each instruction in the old loop.
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    for (BasicBlock::reverse_iterator it = BB->rbegin(), e = BB->rend();
+         it != e; ++it) {
       // Skip dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(it))
+      if (isa<DbgInfoIntrinsic>(*it))
        continue;
 
       // Ignore ephemeral values.
-      if (EphValues.count(it))
+      if (EphValues.count(&*it))
        continue;
 
-      unsigned C = getInstructionCost(it, VF);
+      unsigned C = getInstructionCost(&*it, VF);
 
       // Check if we should override the cost.
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -4401,6 +4415,10 @@
     Cost += BlockCost;
   }
 
+  // Clear the structures for this VF.
+  ClampedInstrTys.clear();
+  FreeCasts.clear();
+
   return Cost;
 }
 
@@ -4463,6 +4481,85 @@
   return false;
 }
 
+bool
+LoopVectorizationCostModel::isFreeCast(Instruction *I, unsigned VF) {
+  CastInst *CI = cast<CastInst>(I);
+  Type *DstTy = CI->getDestTy();
+  Type *DstVecTy = ToVectorTy(DstTy, VF);
+
+  // Search happens from truncs towards the possible extend instructions,
+  // so these would have been found earlier.
+  if (I->getOpcode() != Instruction::Trunc) {
+    if (FreeCasts.find(I) != FreeCasts.end())
+      return true;
+    else
+      return false;
+  }
+
+  if (!DstTy->isIntegerTy())
+    return false;
+
+  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+  unsigned ScalarWidth = DL.getTypeSizeInBits(DstTy);
+
+  // First, check whether the truncation destination size would be a legal
+  // vector type.
+  if (TTI.isTypeLegal(DstVecTy)) {
+    auto *ChainInst = cast<Instruction>(I->getOperand(0));
+    SmallVector<Instruction *, 2> PossibleFreeCasts;
+    unsigned NarrowOperands = 0;
+
+    // Then search the operands of the trunc use and look for sext/zext or
+    // constants within the size limit.
+    for (unsigned i = 0; i < ChainInst->getNumOperands(); ++i) {
+
+      if (isa<ConstantInt>(ChainInst->getOperand(i))) {
+        auto *ConstInt = cast<ConstantInt>(ChainInst->getOperand(i));
+
+        if (ConstInt->uge((1 << ScalarWidth) - 1))
+          break;
+
+        ++NarrowOperands;
+
+      } else if (isa<CastInst>(ChainInst->getOperand(i))) {
+        auto *Cast = cast<CastInst>(ChainInst->getOperand(i));
+
+        if (!Cast->isIntegerCast())
+          break;
+
+        ++NarrowOperands;
+
+        // Check whether the original size of the sext/zext is the same
+        // as the trunc destination size.
+        if (Cast->getSrcTy() == DstTy)
+          PossibleFreeCasts.push_back(Cast);
+        else
+          // If not, we still save the clamped size so later the
+          // zext/sext instruction knows what size is necessary.
+          ClampedInstrTys.insert(std::make_pair(Cast, DstVecTy));
+      }
+    }
+    // If all the operands to this instruction are narrow, make the trunc
+    // and the casts free and assign a clamped VectorTy to the ChainInst.
+    if (NarrowOperands == ChainInst->getNumOperands()) {
+      for (auto *I : PossibleFreeCasts)
+        FreeCasts.insert(I);
+      ClampedInstrTys.insert(std::make_pair(ChainInst, DstVecTy));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+Type*
+LoopVectorizationCostModel::getClampedVectorTy(Instruction *I, unsigned VF) {
+  if (ClampedInstrTys.find(I) != ClampedInstrTys.end())
+    return ClampedInstrTys[I];
+  else
+    return ToVectorTy(I->getType(), VF);
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -4471,7 +4568,7 @@
     VF = 1;
 
   Type *RetTy = I->getType();
-  Type *VectorTy = ToVectorTy(RetTy, VF);
+  Type *VectorTy = getClampedVectorTy(I, VF);
 
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -4630,15 +4727,19 @@
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    Type *SrcVecTy = ToVectorTy(SrcTy, VF);
+    unsigned Opcode = I->getOpcode();
     // We optimize the truncation of induction variable.
     // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
+    if (Opcode == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+      return TTI.getCastInstrCost(Opcode, I->getType(), SrcTy);
 
-    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    if (isFreeCast(I, VF))
+      return 0;
+    else
+      return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,147 @@
+; RUN: opt -S < %s -loop-vectorize -simplifycfg -instsimplify -instcombine 2>&1 | FileCheck %s
+
+; ModuleID = 'vector-test.c'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: load <16 x i8>, <16 x i8>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: add <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1, !tbaa !1
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i8
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx3, align 1, !tbaa !1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: load <8 x i16>, <8 x i16>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <8 x i16>
+; CHECK: add <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp9 = icmp sgt i32 %len, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx, align 2, !tbaa !4
+  %conv8 = zext i16 %0 to i32
+  %add = add nuw nsw i32 %conv8, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3, align 2, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1, !tbaa !1
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3, align 2, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nounwind
+; CHECK-LABEL: @add_d(
+; COST: cost of 2 for instruction: {{.*}} load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; COST: cost of 2 for instruction: {{.*}} store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx, align 2, !tbaa !4
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (http://llvm.org/git/clang.git 93281cbcbb9b1ea2b6788a629d6aef3284957d05) (http://llvm.org/git/llvm.git 40048e70d7a63ab64df3d0e52107f3c0e3472571)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"short", !2, i64 0}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"int", !2, i64 0}
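
For reference, a hypothetical C source for the @add_a test above (the function name and signature are illustrative, not taken from the original vector-test.c; the other tests follow the same pattern with i16/i32 element types):

/* The promotion of unsigned char to int gives the zext i8 -> i32,
   add nuw nsw, trunc i32 -> i8 chain that isFreeCast() recognises,
   so the vectorized loop can stay in <16 x i8> registers. */
void add_a(const unsigned char *p, unsigned char *q, int len) {
  for (int i = 0; i < len; i++)
    q[i] = p[i] + 2;
}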