Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -412,6 +412,35 @@ return 0; return Cost; + + } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { + if (ST->hasP9Altivec()) { + if (ISD == ISD::INSERT_VECTOR_ELT) + // A move-to VSR and a permute/insert. Assume vector operation cost + // for both (cost will be 2x on P9). + return vectorCostAdjustment(2, Opcode, Val, nullptr); + + // It's an extract. Maybe we can do a cheap move-from VSR. + unsigned EltSize = Val->getScalarSizeInBits(); + if (EltSize == 64) { + unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0; + if (Index == MfvsrdIndex) + return 1; + } else if (EltSize == 32) { + unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1; + if (Index == MfvsrwzIndex) + return 1; + } + + // We need a vector extract (or mfvsrld). Assume vector operation cost. + // The cost of the load constant for a vector extract is disregarded + // (invariant, easily schedulable). + return vectorCostAdjustment(1, Opcode, Val, nullptr); + + } else if (ST->hasDirectMove()) + // Assume permute has standard cost. + // Assume move-to/move-from VSR have 2x standard cost. + return 3; } // Estimated cost of a load-hit-store delay. This was obtained Index: llvm/test/Analysis/CostModel/PowerPC/p9.ll =================================================================== --- llvm/test/Analysis/CostModel/PowerPC/p9.ll +++ llvm/test/Analysis/CostModel/PowerPC/p9.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck --check-prefix=CHECK-P8 %s ; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-P9 %s ; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-LE %s @@ -41,15 +42,22 @@ ; CHECK-P9: cost of 2 {{.*}} icmp } -define void @test4xi32(<4 x i32> %arg1, <4 x i32> %arg2, <4 x i32>* %arg3) { +define void @test4xi32(i32 %arg1, <4 x i32>* %arg2) { - %v1 = load <4 x i32>, <4 x i32>* %arg3 - store <4 x i32> %arg2, <4 x i32>* %arg3 + %v1 = load <4 x i32>, <4 x i32>* %arg2 + %v2 = insertelement <4 x i32> %v1, i32 %arg1, i32 2 + store <4 x i32> %v2, <4 x i32>* %arg2 ret void ; CHECK: cost of 1 {{.*}} load + ; CHECK: cost of 10 {{.*}} insertelement ; CHECK: cost of 1 {{.*}} store + ; CHECK-P8-LABEL: test4xi32 + ; CHECK-P8: cost of 1 {{.*}} load + ; CHECK-P8: cost of 3 {{.*}} insertelement + ; CHECK-P8: cost of 1 {{.*}} store ; CHECK-P9: cost of 2 {{.*}} load + ; CHECK-P9: cost of 4 {{.*}} insertelement ; CHECK-P9: cost of 2 {{.*}} store } @@ -66,3 +74,56 @@ ; CHECK-LE: cost of 2 {{.*}} extractelement ; CHECK-LE: cost of 0 {{.*}} extractelement } + +define void @vexti32(<4 x i32> %p1) { + %i1 = extractelement <4 x i32> %p1, i32 0 + %i2 = extractelement <4 x i32> %p1, i32 1 + %i3 = extractelement <4 x i32> %p1, i32 2 + %i4 = extractelement <4 x i32> %p1, i32 3 + ret void + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK-P8-LABEL: vexti32 + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P9: cost of 2 {{.*}} extractelement + ; CHECK-P9: cost of 1 {{.*}} extractelement + ; CHECK-P9: cost of 2 {{.*}} extractelement + ; CHECK-P9: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 1 {{.*}} extractelement + ; CHECK-LE: cost of 2 {{.*}} extractelement +} + +define void @vexti64(<2 x i64> %p1) { + %i1 = extractelement <2 x i64> %p1, i32 0 + %i2 = extractelement <2 x i64> %p1, i32 1 + ret void + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P9: cost of 1 {{.*}} extractelement + ; CHECK-P9: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 1 {{.*}} extractelement +} + +define void @vext(<8 x i16> %p1, <16 x i8> %p2) { + %i1 = extractelement <8 x i16> %p1, i32 0 + %i2 = extractelement <16 x i8> %p2, i32 0 + ret void + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P8: cost of 3 {{.*}} extractelement + ; CHECK-P9: cost of 2 {{.*}} extractelement + ; CHECK-P9: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 2 {{.*}} extractelement +}