diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6960,40 +6960,10 @@
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
   auto SE = PSE.getSE();
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

-  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
-                                                ElementCount VF) -> bool {
-    if (VF.isScalar())
-      return true;
-
-    auto Scalarized = InstsToScalarize.find(VF);
-    assert(Scalarized != InstsToScalarize.end() &&
-           "VF not yet analyzed for scalarization profitability");
-    return !Scalarized->second.count(I) &&
-           llvm::all_of(I->users(), [&](User *U) {
-             auto *UI = cast<Instruction>(U);
-             return !Scalarized->second.count(UI);
-           });
-  };
-  (void) hasSingleCopyAfterVectorization;
-
-  if (isScalarAfterVectorization(I, VF)) {
-    // With the exception of GEPs and PHIs, after scalarization there should
-    // only be one copy of the instruction generated in the loop. This is
-    // because the VF is either 1, or any instructions that need scalarizing
-    // have already been dealt with by the time we get here. As a result,
-    // it means we don't have to multiply the instruction cost by VF.
-    assert(I->getOpcode() == Instruction::GetElementPtr ||
-           I->getOpcode() == Instruction::PHI ||
-           (I->getOpcode() == Instruction::BitCast &&
-            I->getType()->isPointerTy()) ||
-           hasSingleCopyAfterVectorization(I, VF));
-    VectorTy = RetTy;
-  } else
-    VectorTy = ToVectorTy(RetTy, VF);
-
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
@@ -7120,15 +7090,20 @@
         Op2VK = TargetTransformInfo::OK_UniformValue;

     SmallVector<const Value *, 4> Operands(I->operand_values());
-    return TTI.getArithmeticInstrCost(
-        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
-        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+    return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind,
+                                          TargetTransformInfo::OK_AnyValue,
+                                          Op2VK, TargetTransformInfo::OP_None,
+                                          Op2VP, Operands, I);
   }
   case Instruction::FNeg: {
-    return TTI.getArithmeticInstrCost(
-        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
-        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
-        TargetTransformInfo::OP_None, I->getOperand(0), I);
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+    return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind,
+                                          TargetTransformInfo::OK_AnyValue,
+                                          TargetTransformInfo::OK_AnyValue,
+                                          TargetTransformInfo::OP_None,
+                                          TargetTransformInfo::OP_None,
+                                          I->getOperand(0), I);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -7188,10 +7163,6 @@
     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
     return getMemoryInstructionCost(I, VF);
   }
-  case Instruction::BitCast:
-    if (I->getType()->isPointerTy())
-      return 0;
-    LLVM_FALLTHROUGH;
   case Instruction::ZExt:
   case Instruction::SExt:
   case Instruction::FPToUI:
@@ -7202,7 +7173,8 @@
   case Instruction::SIToFP:
   case Instruction::UIToFP:
   case Instruction::Trunc:
-  case Instruction::FPTrunc: {
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
     // Computes the CastContextHint from a Load/Store instruction.
     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -7278,7 +7250,14 @@
       }
     }

-    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
+    unsigned N;
+    if (isScalarAfterVectorization(I, VF)) {
+      assert(!VF.isScalable() && "VF is assumed to be non scalable");
+      N = VF.getKnownMinValue();
+    } else
+      N = 1;
+    return N *
+           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
@@ -7302,8 +7281,11 @@
       return InstructionCost::getInvalid();
     LLVM_FALLTHROUGH;
   default:
-    // This opcode is unknown. Assume that it is the same as 'mul'.
-    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+    // The cost of executing VF copies of the scalar instruction. This opcode
+    // is unknown. Assume that it is the same as 'mul'.
+    return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
+                                       Instruction::Mul, VectorTy, CostKind) +
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: all_scalar
 ; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(i64* %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
@@ -1,13 +1,44 @@
 ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
-; REQUIRES: asserts
-; XFAIL: *
-
 target triple = "x86_64-apple-macosx"

-; CHECK: vector.body
-
 define void @test_pr55096(i64 %c, ptr %p) {
+; CHECK-LABEL: @test_pr55096(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %pred.store.continue3 ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %pred.store.continue3 ]
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT: [[TMP1:%.*]] = mul i16 [[TMP0]], 2008
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 6229, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <2 x i64> [[VEC_IND]], {{.*}}
+; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label %pred.store.if, label %pred.store.continue
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i16 [[TMP5]], 2008
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i16 4943, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i16 [[TMP7]]
+; CHECK-NEXT: store i16 0, ptr [[TMP8]], align 2
+; CHECK-NEXT: br label %pred.store.continue
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP9:%.*]] = phi i16 [ poison, %vector.body ], [ [[TMP7]], %pred.store.if ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label %pred.store.if2, label %pred.store.continue3
+; CHECK: pred.store.if2:
+; CHECK-NEXT: [[TMP11:%.*]] = add i16 [[OFFSET_IDX]], 2008
+; CHECK-NEXT: [[TMP12:%.*]] = add i16 [[TMP11]], 2008
+; CHECK-NEXT: [[TMP13:%.*]] = udiv i16 4943, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP13]]
+; CHECK-NEXT: store i16 0, ptr [[TMP14]], align 2
+; CHECK-NEXT: br label %pred.store.continue3
+; CHECK: pred.store.continue3:
+; CHECK-NEXT: [[TMP15:%.*]] = phi i16 [ poison, %pred.store.continue ], [ [[TMP13]], %pred.store.if2 ]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 340
+; CHECK-NEXT: br i1 [[TMP16]], label %middle.block, label %vector.body
+;
 entry:
   br label %loop.header