Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -363,13 +363,19 @@ unsigned Width = LT.second.getVectorNumElements(); Index = Index % Width; - // The element at index zero is already inside the vector. - if (Index == 0) + // Floating-point scalars are already located in index #0. + if (Val->getScalarType()->isFloatingPointTy() && Index == 0) return 0; } - // All other insert/extracts cost this much. - return ST->getVectorInsertExtractBaseCost(); + // For all other cross-class inserts/extracts, return the cost specified by + // the sub-target. + if (!Val->getScalarType()->isFloatingPointTy()) + return ST->getVectorInsertExtractBaseCost(); + + // Fall back to the base TTI implementation for floating-point + // inserts/extracts. + return BaseT::getVectorInstrCost(Opcode, Val, Index); } int AArch64TTIImpl::getArithmeticInstrCost( Index: test/Analysis/CostModel/AArch64/bswap.ll =================================================================== --- test/Analysis/CostModel/AArch64/bswap.ll +++ test/Analysis/CostModel/AArch64/bswap.ll @@ -36,35 +36,35 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'bswap_v2i32': -; CHECK: Found an estimated cost of 8 for instruction: %bswap +; CHECK: Found an estimated cost of 14 for instruction: %bswap %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a) ret <2 x i32> %bswap } define <4 x i16> @bswap_v4i16(<4 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'bswap_v4i16': -; CHECK: Found an estimated cost of 22 for instruction: %bswap +; CHECK: Found an estimated cost of 28 for instruction: %bswap %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %a) ret <4 x i16> %bswap } define <2 x i64> @bswap_v2i64(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'bswap_v2i64': -; CHECK: Found an 
estimated cost of 8 for instruction: %bswap +; CHECK: Found an estimated cost of 14 for instruction: %bswap %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a) ret <2 x i64> %bswap } define <4 x i32> @bswap_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'bswap_v4i32': -; CHECK: Found an estimated cost of 22 for instruction: %bswap +; CHECK: Found an estimated cost of 28 for instruction: %bswap %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a) ret <4 x i32> %bswap } define <8 x i16> @bswap_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'bswap_v8i16': -; CHECK: Found an estimated cost of 50 for instruction: %bswap +; CHECK: Found an estimated cost of 56 for instruction: %bswap %bswap = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a) ret <8 x i16> %bswap } Index: test/Analysis/CostModel/AArch64/falkor.ll =================================================================== --- test/Analysis/CostModel/AArch64/falkor.ll +++ test/Analysis/CostModel/AArch64/falkor.ll @@ -9,7 +9,7 @@ - ; Vector extracts - extracting the first element should have a zero cost; - ; all other elements should have a cost of two. + ; Vector extracts - extracting any element, including the first, should + ; have a cost of two. ; - ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0 + ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0 ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1 %t1 = extractelement <2 x i64> undef, i32 0 %t2 = extractelement <2 x i64> undef, i32 1 @@ -17,7 +17,7 @@ - ; Vector inserts - inserting the first element should have a zero cost; all - ; other elements should have a cost of two. + ; Vector inserts - inserting any element, including the first, should have a + ; cost of two. 
; - ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 + ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 Index: test/Analysis/CostModel/AArch64/inserts-extracts.ll =================================================================== --- /dev/null +++ test/Analysis/CostModel/AArch64/inserts-extracts.ll @@ -0,0 +1,70 @@ +; RUN: opt < %s -cost-model -analyze | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define void @floating_point() { + ; CHECK-LABEL: floating_point + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = extractelement <4 x double> undef, i32 0 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp1 = extractelement <4 x double> undef, i32 1 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp2 = extractelement <4 x double> undef, i32 2 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp3 = extractelement <4 x double> undef, i32 3 + %tmp0 = extractelement <4 x double> undef, i32 0 + %tmp1 = extractelement <4 x double> undef, i32 1 + %tmp2 = extractelement <4 x double> undef, i32 2 + %tmp3 = extractelement <4 x double> undef, i32 3 + + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp4 = insertelement <4 x double> undef, double undef, i32 0 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp5 = insertelement <4 x double> undef, double undef, i32 1 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp6 = insertelement <4 x double> undef, double undef, i32 2 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp7 = insertelement <4 x double> undef, double 
undef, i32 3 + %tmp4 = insertelement <4 x double> undef, double undef, i32 0 + %tmp5 = insertelement <4 x double> undef, double undef, i32 1 + %tmp6 = insertelement <4 x double> undef, double undef, i32 2 + %tmp7 = insertelement <4 x double> undef, double undef, i32 3 + ret void +} + +define void @integer() { + ; CHECK-LABEL: integer + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = extractelement <4 x i64> undef, i32 0 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = extractelement <4 x i64> undef, i32 1 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp2 = extractelement <4 x i64> undef, i32 2 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp3 = extractelement <4 x i64> undef, i32 3 + %tmp0 = extractelement <4 x i64> undef, i32 0 + %tmp1 = extractelement <4 x i64> undef, i32 1 + %tmp2 = extractelement <4 x i64> undef, i32 2 + %tmp3 = extractelement <4 x i64> undef, i32 3 + + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp4 = insertelement <4 x i64> undef, i64 undef, i32 0 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp5 = insertelement <4 x i64> undef, i64 undef, i32 1 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp6 = insertelement <4 x i64> undef, i64 undef, i32 2 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp7 = insertelement <4 x i64> undef, i64 undef, i32 3 + %tmp4 = insertelement <4 x i64> undef, i64 undef, i32 0 + %tmp5 = insertelement <4 x i64> undef, i64 undef, i32 1 + %tmp6 = insertelement <4 x i64> undef, i64 undef, i32 2 + %tmp7 = insertelement <4 x i64> undef, i64 undef, i32 3 + ret void +} + +define void @pointer() { + ; CHECK-LABEL: pointer + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = extractelement <4 x i8*> undef, i32 0 + ; CHECK-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %tmp1 = extractelement <4 x i8*> undef, i32 1 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp2 = extractelement <4 x i8*> undef, i32 2 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp3 = extractelement <4 x i8*> undef, i32 3 + %tmp0 = extractelement <4 x i8*> undef, i32 0 + %tmp1 = extractelement <4 x i8*> undef, i32 1 + %tmp2 = extractelement <4 x i8*> undef, i32 2 + %tmp3 = extractelement <4 x i8*> undef, i32 3 + + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp4 = insertelement <4 x i8*> undef, i8* undef, i32 0 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp5 = insertelement <4 x i8*> undef, i8* undef, i32 1 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp6 = insertelement <4 x i8*> undef, i8* undef, i32 2 + ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp7 = insertelement <4 x i8*> undef, i8* undef, i32 3 + %tmp4 = insertelement <4 x i8*> undef, i8* undef, i32 0 + %tmp5 = insertelement <4 x i8*> undef, i8* undef, i32 1 + %tmp6 = insertelement <4 x i8*> undef, i8* undef, i32 2 + %tmp7 = insertelement <4 x i8*> undef, i8* undef, i32 3 + ret void +} Index: test/Analysis/CostModel/AArch64/kryo.ll =================================================================== --- test/Analysis/CostModel/AArch64/kryo.ll +++ test/Analysis/CostModel/AArch64/kryo.ll @@ -9,7 +9,7 @@ - ; Vector extracts - extracting the first element should have a zero cost; - ; all other elements should have a cost of two. + ; Vector extracts - extracting any element, including the first, should + ; have a cost of two. 
; - ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0 + ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0 ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1 %t1 = extractelement <2 x i64> undef, i32 0 %t2 = extractelement <2 x i64> undef, i32 1 @@ -17,7 +17,7 @@ - ; Vector inserts - inserting the first element should have a zero cost; all - ; other elements should have a cost of two. + ; Vector inserts - inserting any element, including the first, should have a + ; cost of two. ; - ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 + ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 Index: test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -12,7 +12,7 @@ ; %tmp4 a lower scalarization overhead. 
; ; COST-LABEL: predicated_udiv_scalarized_operand -; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 +; COST: LV: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 ; ; CHECK-LABEL: @predicated_udiv_scalarized_operand( ; CHECK: vector.body: Index: test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll +++ test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -11,7 +11,7 @@ %pair = type { i8, i8 } ; CHECK-LABEL: test -; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 32 for VF 2 For instruction: {{.*}} load i8 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 ; CHECK: vector.body ; CHECK: load i8 Index: test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -168,10 +168,10 @@ ; gaps. 
; ; VF_2-LABEL: Checking a loop in "i64_factor_8" -; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2 Index: test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -16,9 +16,9 @@ ; as: ; ; Cost of udiv: -; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; (udiv(2) + extractelement(12) + insertelement(6)) / 2 = 10 ; -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 10 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { @@ -57,9 +57,9 @@ ; as: ; ; Cost of store: -; (store(4) + extractelement(3)) / 2 = 3 +; (store(4) + extractelement(6)) / 2 = 5 ; -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: 
Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { @@ -94,12 +94,12 @@ ; compute the cost as: ; ; Cost of add: -; (add(2) + extractelement(3)) / 2 = 2 +; (add(2) + extractelement(6)) / 2 = 4 ; Cost of udiv: -; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4 +; (udiv(2) + extractelement(6) + insertelement(6)) / 2 = 7 ; -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; @@ -139,11 +139,11 @@ ; compute the cost as: ; ; Cost of add: -; (add(2) + extractelement(3)) / 2 = 2 +; (add(2) + extractelement(6)) / 2 = 4 ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 @@ -184,18 +184,18 @@ ; Cost of add: ; add(1) = 1 ; Cost of sdiv: -; (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; (sdiv(2) + extractelement(12) + insertelement(6)) / 2 = 10 ; Cost of udiv: -; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; (udiv(2) + extractelement(12) + insertelement(6)) / 2 = 10 ; Cost of sub: -; (sub(2) + extractelement(3)) / 2 = 2 +; (sub(2) + extractelement(6)) / 2 = 4 ; Cost of store: ; store(4) / 2 = 2 ; ; CHECK: Found an estimated 
cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x +; CHECK: Found an estimated cost of 10 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Found an estimated cost of 10 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 ; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x ; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT -; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER +; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-37 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER ; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" Index: test/Transforms/SLPVectorizer/AArch64/getelementptr.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -1,4 +1,4 @@ -; RUN: opt -S 
-slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -slp-threshold=-23 -dce -instcombine < %s | FileCheck %s target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" Index: test/Transforms/SLPVectorizer/AArch64/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,4 +1,4 @@ -; RUN: opt -slp-vectorizer -slp-threshold=-6 -S < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-11 -S < %s | FileCheck %s ; FIXME: The threshold is changed to keep this test case a bit smaller. ; The AArch64 cost model should not give such high costs to select statements. Index: test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll +++ test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s +; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-5 -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu"