diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -173,9 +173,10 @@ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - using BaseT::getVectorInstrCost; InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, + unsigned Index); InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1976,6 +1976,40 @@ return ST->getVectorInsertExtractBaseCost(); } +InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, + Type *Val, unsigned Index) { + InstructionCost cost = this->getVectorInstrCost(I.getOpcode(), Val, Index); + + auto IsExtractedElementUsedAsInteger = + [Val](const Instruction *const Inst) -> bool { + if (!isa_and_nonnull<ExtractElementInst>(Inst) || + !Val->getScalarType()->isIntegerTy()) + return false; + + // According to NEON programmer guide, other than multiply instructions, + // instructions that access scalars can access any element in the register + // file. + // + // The cost of extracting a scalar element from a vector register depends + // on how scalar will be used: + // 1. If users could use scalars in vector registers directly, the + // extract-element operation is essentially free. + // 2. If the user instruction requires core register as operand (i.e., + // cannot use scalars in vector register), an explicit move operation will + // be codegen'd. 
+ + // FIXME: + // Do more accurate cost estimation by analyzing the uses of instruction. + return !Inst->use_empty(); + }; + + // 'cost' might be an optimistic 0 when lane is 0. + // Returns the base cost if we know an explicit move is needed. + return IsExtractedElementUsedAsInteger(&I) + ? ST->getVectorInsertExtractBaseCost() + : cost; +} + InstructionCost AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, diff --git a/llvm/test/Analysis/CostModel/AArch64/kryo.ll b/llvm/test/Analysis/CostModel/AArch64/kryo.ll --- a/llvm/test/Analysis/CostModel/AArch64/kryo.ll +++ b/llvm/test/Analysis/CostModel/AArch64/kryo.ll @@ -21,26 +21,22 @@ ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 - ret void } ; CHECK-LABEL: vectorInstrExtractCost define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) { - - ; Vector extracts - extracting each element at index 0 is considered - ; free in the current implementation. When extracting element at index - ; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as - ; well. - ; ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1 - ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2 + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 2 %t1 = extractelement <4 x i64> %vecreg, i32 1 %t2 = extractelement <4 x i64> %vecreg, i32 2 %ele = add i64 %t2, 1 %cond = icmp eq i64 %t1, %ele - ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0 + ; Vector extracts - extracting each element should have a cost + ; if they are used as integers. 
+ ; + ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 0 ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3 %t0 = extractelement <4 x i64> %vecreg, i32 0 %t3 = extractelement <4 x i64> %vecreg, i32 3 diff --git a/llvm/test/Transforms/LICM/AArch64/extract-element.ll b/llvm/test/Transforms/LICM/AArch64/extract-element.ll --- a/llvm/test/Transforms/LICM/AArch64/extract-element.ll +++ b/llvm/test/Transforms/LICM/AArch64/extract-element.ll @@ -18,24 +18,23 @@ ; CHECK-NEXT: [[TMP12]] = add i64 [[TMP4]], 1 ; CHECK-NEXT: br label [[TMP3]] ; CHECK: .split.loop.exit: -; CHECK-NEXT: [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ] ; CHECK-NEXT: [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ] ; CHECK-NEXT: [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], -1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]] -; CHECK-NEXT: br label [[TMP17:%.*]] +; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]] +; CHECK-NEXT: br label [[TMP16:%.*]] ; CHECK: .split.loop.exit2: ; CHECK-NEXT: [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ] ; CHECK-NEXT: [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ] -; CHECK-NEXT: br label [[TMP17]] -; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ] -; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[TMP18]], true -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]] -; CHECK-NEXT: ret i1 [[TMP21]] +; CHECK-NEXT: br label [[TMP16]] +; 
CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ] +; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]] +; CHECK-NEXT: ret i1 [[TMP20]] ; br label %3