Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -559,6 +559,7 @@
     if (Subtarget.hasVSX()) {
       setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
       if (Subtarget.hasP8Vector()) {
         setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
@@ -572,6 +573,7 @@
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
       }

       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -850,6 +850,10 @@
           (f64 (EXTRACT_SUBREG $S, sub_64))>;
 def : Pat<(f64 (extractelt v2f64:$S, 1)),
           (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 0)),
+          (v2f64 (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), $A, 1))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
+          (v2f64 (XXPERMDI $A, (COPY_TO_REGCLASS $B, VSRC), 0))>;
 }

 let Predicates = [IsLittleEndian] in {
@@ -861,6 +865,10 @@
           (f64 (EXTRACT_SUBREG $S, sub_64))>;
 def : Pat<(f64 (extractelt v2f64:$S, 0)),
           (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
 def : Pat<(f64 (extractelt v2f64:$S, 1)),
           (f64 (EXTRACT_SUBREG $S, sub_64))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 0)),
+          (v2f64 (XXPERMDI $A, (COPY_TO_REGCLASS $B, VSRC), 0))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
+          (v2f64 (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), $A, 1))>;
 }

 // Additional fnmsub patterns: -a*c + b == -(a*c - b)
@@ -1657,6 +1665,12 @@
             (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
   def : Pat<(v2i64 (scalar_to_vector i64:$A)),
             (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 0)),
+            (v2i64 (XXPERMDI
+                      (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), $V, 1))>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 1)),
+            (v2i64 (XXPERMDI
+                      $V, (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), 0))>;
   def : Pat<(i32 (vector_extract v16i8:$S, 0)),
             (i32 VectorExtractions.LE_BYTE_15)>;
   def : Pat<(i32 (vector_extract v16i8:$S, 1)),
@@ -1764,6 +1778,12 @@
             (v4i32 MovesToVSR.LE_WORD_0)>;
   def : Pat<(v2i64 (scalar_to_vector i64:$A)),
             (v2i64 MovesToVSR.LE_DWORD_0)>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 0)),
+            (v2i64 (XXPERMDI
+                      $V, (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), 0))>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 1)),
+            (v2i64 (XXPERMDI
+                      (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), $V, 1))>;
   def : Pat<(i32 (vector_extract v16i8:$S, 0)),
             (i32 VectorExtractions.LE_BYTE_0)>;
   def : Pat<(i32 (vector_extract v16i8:$S, 1)),
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -315,16 +315,11 @@
 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   assert(Val->isVectorTy() && "This must be a vector type");
+  int DirectMoveCost = 5;

   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD &&
          "Invalid opcode");

-  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
-    // Double-precision scalars are already located in index #0.
-    if (Index == 0)
-      return 0;
-
-    return BaseT::getVectorInstrCost(Opcode, Val, Index);
-  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
+  if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
     // Floating point scalars are already located in index #0.
     if (Index == 0)
       return 0;
@@ -332,6 +327,75 @@
     return BaseT::getVectorInstrCost(Opcode, Val, Index);
   }

+  // Handle vector insertions and extractions differently because extractions
+  // are inherently more efficient on Power hardware prior to Power9.
+  if (ISD == ISD::EXTRACT_VECTOR_ELT) {
+    // Doubles are already at index 1/0 (LE/BE).
+    if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
+      if (Index == (ST->isLittleEndian() ? 1 : 0))
+        return 0;
+      else // Otherwise it needs a swap.
+        return 1;
+    }
+
+    // Integers are moved out of the following indices (LE/BE):
+    //   8-bit  -> 8/7
+    //   16-bit -> 3/4
+    //   32-bit -> 2/1
+    //   64-bit -> 1/0
+    auto isAtNaturalIndex = [=](int BitWidth, bool IsLE) -> bool {
+      if (IsLE)
+        return (BitWidth == 64 && Index == 1) ||
+               (BitWidth == 32 && Index == 2) ||
+               (BitWidth == 16 && Index == 3) ||
+               (BitWidth == 8 && Index == 8);
+      return (BitWidth == 64 && Index == 0) ||
+             (BitWidth == 32 && Index == 1) ||
+             (BitWidth == 16 && Index == 4) ||
+             (BitWidth == 8 && Index == 7);
+    };
+
+    if (Val->getScalarType()->isIntegerTy() && ST->hasDirectMove() &&
+        isAtNaturalIndex(Val->getScalarSizeInBits(), ST->isLittleEndian()))
+      return DirectMoveCost;
+    else if (ST->hasDirectMove() && Index != -1U)
+      // Otherwise they need repositioning in the vector register.
+      return DirectMoveCost + 1;
+  } else if (ISD == ISD::INSERT_VECTOR_ELT) {
+    // Doubles only need a variant of XXPERMDI to be inserted (when the index
+    // is constant).
+    if (ST->hasVSX() && Val->getScalarType()->isDoubleTy() && Index != -1U)
+      return 1;
+
+    // On Power9 we do not need to permute the vectors when inserting a 32-bit
+    // value. Floating point gets converted and inserted; integers are moved
+    // and inserted.
+    if (ST->hasP9Vector() && Val->getScalarType()->isFloatTy() && Index != -1U)
+      return 2; // Convert + insert.
+    if (ST->hasP9Vector() && Val->getScalarType()->isIntegerTy(32) &&
+        Index != -1U)
+      return DirectMoveCost + 1; // Move + insert.
+
+    // 64-bit integers need a direct move and a variant of XXPERMDI to be
+    // inserted (when the index is constant).
+    if (ST->hasDirectMove() && Val->getScalarType()->isIntegerTy(64) &&
+        Index != -1U)
+      return DirectMoveCost + 1;
+
+    // Smaller integers are inserted by moving into the VSR, loading a permute
+    // mask and permuting the vectors.
+    if (ST->hasDirectMove() && Val->getScalarType()->isIntegerTy() &&
+        Index != -1U)
+      return DirectMoveCost + 3;
+  } else
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  // Other inserts/extracts will incur an LHS penalty.
+
   // Estimated cost of a load-hit-store delay.  This was obtained
   // experimentally as a minimum needed to prevent unprofitable
   // vectorization for the paq8p benchmark.  It may need to be
It may need to be Index: test/Analysis/CostModel/PowerPC/insert_extract.ll =================================================================== --- test/Analysis/CostModel/PowerPC/insert_extract.ll +++ test/Analysis/CostModel/PowerPC/insert_extract.ll @@ -9,7 +9,7 @@ } define i32 @extract(<4 x i32> %arg) { - ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK: cost of 5 {{.*}} extractelement %x = extractelement <4 x i32> %arg, i32 0 ret i32 %x } Index: test/CodeGen/PowerPC/swaps-le-5.ll =================================================================== --- test/CodeGen/PowerPC/swaps-le-5.ll +++ test/CodeGen/PowerPC/swaps-le-5.ll @@ -15,11 +15,12 @@ } ; CHECK-LABEL: @bar0 -; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] -; CHECK-DAG: xxspltd [[REG2:[0-9]+]] -; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1 -; CHECK: stxvd2x [[REG3]] +; CHECK-DAG: xxswapd [[REG1:[0-9]+]], 1 +; CHECK-DAG: lxvd2x [[REG2:[0-9]+]] +; CHECK-NOT: xxswapd +; CHECK:xxmrgld [[REG3:[0-9]+]], [[REG1]], [[REG2]] ; CHECK-NOT: xxswapd +; CHECK: stxvd2x [[REG3]] define void @bar1(double %y) { entry: @@ -30,9 +31,9 @@ } ; CHECK-LABEL: @bar1 -; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] -; CHECK-DAG: xxspltd [[REG2:[0-9]+]] -; CHECK: xxmrghd [[REG3:[0-9]+]], [[REG1]], [[REG2]] +; CHECK-DAG: xxswapd [[REG1:[0-9]+]], 1 +; CHECK-DAG: lxvd2x [[REG2:[0-9]+]] +; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1 ; CHECK: stxvd2x [[REG3]] ; CHECK-NOT: xxswapd Index: test/CodeGen/PowerPC/swaps-le-6.ll =================================================================== --- test/CodeGen/PowerPC/swaps-le-6.ll +++ test/CodeGen/PowerPC/swaps-le-6.ll @@ -20,8 +20,9 @@ ; CHECK-LABEL: @bar0 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: lxsdx [[REG2:[0-9]+]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1 +; CHECK: xxswapd [[REG4:[0-9]+]], [[REG2]] +; CHECK: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] +; CHECK-NOT: xxswapd ; CHECK: stxvd2x [[REG5]] define void @bar1() { @@ -36,7 +37,8 @@ ; CHECK-LABEL: @bar1 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: lxsdx [[REG2:[0-9]+]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]] +; CHECK: xxswapd [[REG4:[0-9]+]], [[REG2]] +; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 +; CHECK-NOT: xxswapd ; CHECK: stxvd2x [[REG5]] Index: test/CodeGen/PowerPC/vsx_insert_extract_le.ll =================================================================== --- test/CodeGen/PowerPC/vsx_insert_extract_le.ll +++ test/CodeGen/PowerPC/vsx_insert_extract_le.ll @@ -10,8 +10,7 @@ ; CHECK: lxvd2x 0, 0, 3 ; CHECK: lxsdx 1, 0, 4 ; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 1, 1, 0 -; CHECK: xxpermdi 34, 0, 1, 1 +; CHECK: xxmrghd 34, 0, 1 } define <2 x double> @testi1(<2 x double>* %p1, double* %p2) { @@ -24,8 +23,7 @@ ; CHECK: lxvd2x 0, 0, 3 ; CHECK: lxsdx 1, 0, 4 ; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 1, 1, 0 -; CHECK: xxmrgld 34, 1, 0 +; CHECK: xxpermdi 34, 1, 0, 1 } define double @teste0(<2 x double>* %p1) {