Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -65,8 +65,8 @@ // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse' // indicates whether the vector instruction is available in the input IR or // just imaginary in vectorizer passes. - InstructionCost getVectorInstrCostHelper(Type *Val, unsigned Index, - bool HasRealUse); + InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val, + unsigned Index, bool HasRealUse); public: explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2184,7 +2184,8 @@ return 0; } -InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val, +InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, + Type *Val, unsigned Index, bool HasRealUse) { assert(Val->isVectorTy() && "This must be a vector type"); @@ -2210,14 +2211,21 @@ // needed. So it has non-zero cost. // - For the rest of cases (virtual instruction or element type is float), // consider the instruction free. - // + if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) + return 0; + + // This is recognising a LD1 single-element structure to one lane of one + // register instruction. I.e., if this is an `insertelement` instruction, + // and its second operand is a load, then we will generate a LD1, which + // is an expensive instruction. 
+ if (I && dyn_cast<LoadInst>(I->getOperand(1))) + return ST->getVectorInsertExtractBaseCost() + 1; + // FIXME: // If the extract-element and insert-element instructions could be // simplified away (e.g., could be combined into users by looking at use-def // context), they have no cost. This is not done in the first place for // compile-time considerations. - if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) - return 0; } // All other insert/extracts cost this much. @@ -2227,12 +2235,13 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, Value *Op0, Value *Op1) { - return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */); + return getVectorInstrCostHelper(nullptr, Val, Index, false /* HasRealUse */); } InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val, unsigned Index) { - return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */); + + return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */); } InstructionCost AArch64TTIImpl::getArithmeticInstrCost( Index: llvm/test/Analysis/CostModel/AArch64/insert-extract.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/insert-extract.ll +++ llvm/test/Analysis/CostModel/AArch64/insert-extract.ll @@ -108,12 +108,12 @@ define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) { ; KRYO-LABEL: 'LD1_B' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1 -; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 +; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2 ; ; NEO-LABEL: 'LD1_B' ; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1 -; 
NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 +; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 ; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2 ; entry: @@ -125,12 +125,12 @@ define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) { ; KRYO-LABEL: 'LD1_H' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2 -; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 +; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2 ; ; NEO-LABEL: 'LD1_H' ; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 +; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 ; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2 ; entry: @@ -142,12 +142,12 @@ define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) { ; KRYO-LABEL: 'LD1_W' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4 -; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 +; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2 ; ; NEO-LABEL: 'LD1_W' ; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, 
align 4 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 +; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 ; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2 ; entry: @@ -159,12 +159,12 @@ define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) { ; KRYO-LABEL: 'LD1_X' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8 -; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 +; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2 ; ; NEO-LABEL: 'LD1_X' ; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 +; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 ; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2 ; entry: