// Patch summary (description only; the diff text below this header is unchanged).
//
// lib/Transforms/Vectorize/LoopVectorize.cpp
//   * expectedCost() gains a `bool &ActuallyVectorized` output parameter. A
//     one-argument inline overload is added that passes a local flag
//     (initialized to false) and discards it.
//   * getInstructionCost() gains the same parameter. In the visible hunk it
//     only ever sets the flag to true, and does so when all of these hold:
//     VF > 1, the flag is still false, VectorTy is not void, and
//     TTI.getNumberOfParts(VectorTy) < VF.
//     NOTE(review): presumably "fewer parts than VF" means the type is not
//     fully scalarized by the target -- confirm against the TTI documentation.
//   * expectedCost() forwards the flag to getInstructionCost() for each
//     instruction that is not in ValuesToIgnore.
//   * In the loop that compares candidate widths `i`, the flag is reset to
//     false per width. The "costs:" debug line is still printed first; then a
//     width whose flag stayed false is skipped with `continue` unless
//     ForceVectorization is set, and a debug message reports the skip.
//
// test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll (new file)
//   * RUN line: `opt -S -loop-vectorize` piped into FileCheck.
//   * Target triple powerpc64-bgq-linux; @test carries target-cpu "a2q" with
//     "+qpx" and the other listed vector features disabled.
//   * The checks are `CHECK-LABEL: @test` followed by `CHECK-NOT: x i32>`.
//
// NOTE(review): this text appears to have lost its original line breaks (the
// entire diff sits on three physical lines), so it presumably will not apply
// with patch tools as-is -- confirm against the upstream revision.
Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1536,11 +1536,18 @@ /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - unsigned expectedCost(unsigned VF); + unsigned expectedCost(unsigned VF, bool &ActuallyVectorized); + + /// Call expectedCost, but ignore the ActuallyVectorized output parameter. + unsigned expectedCost(unsigned VF) { + bool ActuallyVectorized = false; + return expectedCost(VF, ActuallyVectorized); + } /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - unsigned getInstructionCost(Instruction *I, unsigned VF); + unsigned getInstructionCost(Instruction *I, unsigned VF, + bool &ActuallyVectorized); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -5164,9 +5171,15 @@ // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. - float VectorCost = expectedCost(i) / (float)i; + bool ActuallyVectorized = false; + float VectorCost = expectedCost(i, ActuallyVectorized) / (float)i; DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); + if (!ActuallyVectorized && !ForceVectorization) { + DEBUG(dbgs() << "LV: Not considering vector loop of width " << i << + " because it will not generate any vector instructions.\n"); + continue; + } if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -5541,7 +5554,8 @@ return RUs; } -unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { +unsigned LoopVectorizationCostModel::expectedCost(unsigned VF, + bool &ActuallyVectorized) { unsigned Cost = 0; // For each block. 
@@ -5560,7 +5574,7 @@ if (ValuesToIgnore.count(&*it)) continue; - unsigned C = getInstructionCost(&*it, VF); + unsigned C = getInstructionCost(&*it, VF, ActuallyVectorized); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) @@ -5655,7 +5669,8 @@ } unsigned -LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF, + bool &ActuallyVectorized) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (Legal->isUniformAfterVectorization(I)) @@ -5666,6 +5681,10 @@ RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); Type *VectorTy = ToVectorTy(RetTy, VF); + if (VF > 1 && !ActuallyVectorized && !VectorTy->isVoidTy() && + TTI.getNumberOfParts(VectorTy) < VF) + ActuallyVectorized = true; + // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: Index: test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll @@ -0,0 +1,62 @@ +; RUN: opt -S -loop-vectorize < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" + +; Function Attrs: nounwind +define zeroext i32 @test() #0 { +; CHECK-LABEL: @test +; CHECK-NOT: x i32> + +entry: + %a = alloca [1600 x i32], align 4 + %c = alloca [1600 x i32], align 4 + %0 = bitcast [1600 x i32]* %a to i8* + call void @llvm.lifetime.start(i64 6400, i8* %0) #3 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %1 = bitcast [1600 x i32]* %c to i8* + call void @llvm.lifetime.start(i64 6400, i8* %1) #3 + %arraydecay = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 0 + %arraydecay1 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 0 + %call = call 
signext i32 @bar(i32* %arraydecay, i32* %arraydecay1) #3 + br label %for.body6 + +for.body: ; preds = %for.body, %entry + %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.body ] + %arrayidx = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 %indvars.iv25 + %2 = trunc i64 %indvars.iv25 to i32 + store i32 %2, i32* %arrayidx, align 4 + %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1 + %exitcond27 = icmp eq i64 %indvars.iv.next26, 1600 + br i1 %exitcond27, label %for.cond.cleanup, label %for.body + +for.cond.cleanup5: ; preds = %for.body6 + call void @llvm.lifetime.end(i64 6400, i8* nonnull %1) #3 + call void @llvm.lifetime.end(i64 6400, i8* %0) #3 + ret i32 %add + +for.body6: ; preds = %for.body6, %for.cond.cleanup + %indvars.iv = phi i64 [ 0, %for.cond.cleanup ], [ %indvars.iv.next, %for.body6 ] + %s.022 = phi i32 [ 0, %for.cond.cleanup ], [ %add, %for.body6 ] + %arrayidx8 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 %indvars.iv + %3 = load i32, i32* %arrayidx8, align 4 + %add = add i32 %3, %s.022 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.cond.cleanup5, label %for.body6 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + +declare signext i32 @bar(i32*, i32*) #2 + +attributes #0 = { nounwind "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" } +attributes #3 = { nounwind } +