Index: ../lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- ../lib/Target/X86/X86TargetTransformInfo.cpp +++ ../lib/Target/X86/X86TargetTransformInfo.cpp @@ -578,6 +578,8 @@ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, @@ -594,9 +596,9 @@ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, @@ -689,14 +691,17 @@ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 }, // The generic code to compute the scalar overhead is currently broken. // Workaround this limitation by estimating the scalarization overhead // here. We have roughly 10 instructions per scalar element. // Multiply that by the vector width. // FIXME: remove that when PR19268 is fixed. - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 13 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, // This node is expanded into scalarized operations but BasicTTI is overly @@ -706,6 +711,9 @@ // should be factored in too. Inflating the cost per element by 1. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, + + { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, + { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, }; static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { @@ -740,27 +748,34 @@ }; static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { - // These are somewhat magic numbers justified by looking at the output of - // Intel's IACA, running some kernels and making sure when we take - // legalization into account the throughput will be overestimated. + // These numbers reflect the number of generated instructions + // and do not reflect instruction latency { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 15 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 7 }, // There are faster sequences for float conversions. - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 8 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, @@ -791,62 +806,71 @@ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, }; - std::pair LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair LTDest = TLI->getTypeLegalizationCost(DL, Dst); - - if (ST->hasSSE2() && !ST->hasAVX()) { - if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, - LTDest.second, LTSrc.second)) - return LTSrc.first * Entry->Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); - // The function getSimpleVT only handles simple value types. - if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + MVT SrcVT; + MVT DstVT; + int ExtraSplitCost = 0; + int SplitFactor = 1; + if (!SrcTy.isVector()) { + // Scalar types + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + SrcVT = SrcTy.getSimpleVT(); + DstVT = DstTy.getSimpleVT(); + } else { + // Vector types + auto legalize = [&](Type *T, EVT OrigVT, MVT& SimpleVT) { + std::pair LT = TLI->getTypeLegalizationCost(DL, T); + int SplitFactor; + std::tie(SplitFactor, SimpleVT) = LT; + if (SplitFactor == 1 && OrigVT.isSimple()) + SimpleVT = OrigVT.getSimpleVT(); + return SplitFactor; + }; - if (ST->hasDQI()) - if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; + int SrcSplitFactor = legalize(Src, SrcTy, SrcVT); + int DstSplitFactor = legalize(Dst, DstTy, DstVT); - if (ST->hasAVX512()) - if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; - - if (ST->hasAVX2()) { - if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; - } - - if (ST->hasAVX()) { - if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; - } - - if (ST->hasSSE41()) { - if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; + SplitFactor = std::max(SrcSplitFactor, DstSplitFactor); + if (SplitFactor > 1) { + auto adaptVT = [](MVT VT, int Split) { + return MVT::getVectorVT(VT.getScalarType(), + VT.getVectorNumElements() / Split); + }; + if (SrcSplitFactor > DstSplitFactor) { + DstVT = adaptVT(DstVT, SrcSplitFactor / DstSplitFactor); + ExtraSplitCost = SrcSplitFactor / DstSplitFactor - 1; + } + else if (DstSplitFactor > SrcSplitFactor) { + // Split SrcVT + SrcVT = adaptVT(SrcVT, DstSplitFactor / SrcSplitFactor); + ExtraSplitCost = DstSplitFactor / SrcSplitFactor - 1; + } + } } - if (ST->hasSSE2()) { - if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; + SmallVector, 8> Tbls; + if (ST->hasDQI()) + Tbls.push_back(AVX512DQConversionTbl); + if (ST->hasAVX512()) + Tbls.push_back(AVX512FConversionTbl); + if (ST->hasAVX2()) + Tbls.push_back(AVX2ConversionTbl); + if (ST->hasAVX()) + Tbls.push_back(AVXConversionTbl); + if (ST->hasSSE41()) + Tbls.push_back(SSE41ConversionTbl); + if (ST->hasSSE2()) + Tbls.push_back(SSE2ConversionTbl); + + for (ArrayRef Tbl : Tbls) { + if (const auto *Entry = ConvertCostTableLookup(Tbl, ISD, + DstVT, + SrcVT)) + return Entry->Cost * SplitFactor + ExtraSplitCost; } - return BaseT::getCastInstrCost(Opcode, Dst, Src); } Index: ../test/Analysis/CostModel/X86/cast.ll =================================================================== --- ../test/Analysis/CostModel/X86/cast.ll +++ ../test/Analysis/CostModel/X86/cast.ll @@ -37,6 +37,7 @@ define i32 @zext_sext(<8 x i1> %in) { ; CHECK-AVX2-LABEL: for function 'zext_sext' ; CHECK-AVX-LABEL: for function 'zext_sext' +; CHECK-AVX512-LABEL: for function 'zext_sext' ;CHECK-AVX2: cost of 3 {{.*}} zext ;CHECK-AVX: cost of 4 {{.*}} zext %Z = zext <8 x i1> %in to <8 x i32> @@ -114,7 +115,7 @@ %F3 = trunc <4 x i64> undef to <4 x i8> ;CHECK-AVX2: cost of 4 {{.*}} trunc - ;CHECK-AVX: cost of 9 {{.*}} trunc + ;CHECK-AVX: cost of 8 {{.*}} trunc ;CHECK_AVX512: cost of 1 {{.*}} G = trunc %G = trunc <8 x i64> undef to <8 x i32> Index: ../test/Analysis/CostModel/X86/sitofp.ll =================================================================== --- ../test/Analysis/CostModel/X86/sitofp.ll +++ ../test/Analysis/CostModel/X86/sitofp.ll @@ -5,23 +5,23 @@ define <2 x double> @sitofpv2i8v2double(<2 x i8> %a) { ; SSE2-LABEL: sitofpv2i8v2double - ; SSE2: cost of 20 {{.*}} sitofp + ; SSE2: cost of 3 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i8v2double - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 3 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i8v2double - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 3 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i8v2double - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <2 x i8> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i8v4double(<4 x i8> %a) { ; SSE2-LABEL: sitofpv4i8v4double - ; SSE2: cost of 40 {{.*}} sitofp + ; SSE2: cost of 6 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i8v4double ; AVX1: cost of 3 {{.*}} sitofp @@ -37,13 +37,13 @@ define <8 x double> @sitofpv8i8v8double(<8 x i8> %a) { ; SSE2-LABEL: sitofpv8i8v8double - ; SSE2: cost of 80 {{.*}} sitofp + ; SSE2: cost of 12 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i8v8double - ; AVX1: cost of 20 {{.*}} sitofp + ; AVX1: cost of 6 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv8i8v8double - ; AVX2: cost of 20 {{.*}} sitofp + ; AVX2: cost of 6 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv8i8v8double ; AVX512F: cost of 2 {{.*}} sitofp @@ -53,55 +53,55 @@ define <16 x double> @sitofpv16i8v16double(<16 x i8> %a) { ; SSE2-LABEL: sitofpv16i8v16double - ; SSE2: cost of 160 {{.*}} sitofp + ; SSE2: cost of 24 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i8v16double - ; AVX1: cost of 40 {{.*}} sitofp + ; AVX1: cost of 12 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i8v16double - ; AVX2: cost of 40 {{.*}} sitofp + ; AVX2: cost of 12 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i8v16double - ; AVX512F: cost of 44 {{.*}} sitofp + ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <16 x i8> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i8v32double(<32 x i8> %a) { ; SSE2-LABEL: sitofpv32i8v32double - ; SSE2: cost of 320 {{.*}} sitofp + ; SSE2: cost of 48 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i8v32double - ; AVX1: cost of 80 {{.*}} sitofp + ; AVX1: cost of 24 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i8v32double - ; AVX2: cost of 80 {{.*}} sitofp + ; AVX2: cost of 24 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i8v32double - ; AVX512F: cost of 88 {{.*}} sitofp + ; AVX512F: cost of 8 {{.*}} sitofp %1 = sitofp <32 x i8> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i16v2double(<2 x i16> %a) { ; SSE2-LABEL: sitofpv2i16v2double - ; SSE2: cost of 20 {{.*}} sitofp + ; SSE2: cost of 3 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i16v2double - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 3 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i16v2double - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 3 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i16v2double - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <2 x i16> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i16v4double(<4 x i16> %a) { ; SSE2-LABEL: sitofpv4i16v4double - ; SSE2: cost of 40 {{.*}} sitofp + ; SSE2: cost of 6 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i16v4double ; AVX1: cost of 3 {{.*}} sitofp @@ -117,13 +117,13 @@ define <8 x double> @sitofpv8i16v8double(<8 x i16> %a) { ; SSE2-LABEL: sitofpv8i16v8double - ; SSE2: cost of 80 {{.*}} sitofp + ; SSE2: cost of 12 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i16v8double - ; AVX1: cost of 20 {{.*}} sitofp + ; AVX1: cost of 6 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv8i16v8double - ; AVX2: cost of 20 {{.*}} sitofp + ; AVX2: cost of 6 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv8i16v8double ; AVX512F: cost of 2 {{.*}} sitofp @@ -133,55 +133,55 @@ define <16 x double> @sitofpv16i16v16double(<16 x i16> %a) { ; SSE2-LABEL: sitofpv16i16v16double - ; SSE2: cost of 160 {{.*}} sitofp + ; SSE2: cost of 24 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i16v16double - ; AVX1: cost of 40 {{.*}} sitofp + ; AVX1: cost of 12 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i16v16double - ; AVX2: cost of 40 {{.*}} sitofp + ; AVX2: cost of 12 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i16v16double - ; AVX512F: cost of 44 {{.*}} sitofp + ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <16 x i16> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i16v32double(<32 x i16> %a) { ; SSE2-LABEL: sitofpv32i16v32double - ; SSE2: cost of 320 {{.*}} sitofp + ; SSE2: cost of 48 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i16v32double - ; AVX1: cost of 80 {{.*}} sitofp + ; AVX1: cost of 24 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i16v32double - ; AVX2: cost of 80 {{.*}} sitofp + ; AVX2: cost of 24 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i16v32double - ; AVX512F: cost of 88 {{.*}} sitofp + ; AVX512F: cost of 8 {{.*}} sitofp %1 = sitofp <32 x i16> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @sitofpv2i32v2double(<2 x i32> %a) { ; SSE2-LABEL: sitofpv2i32v2double - ; SSE2: cost of 20 {{.*}} sitofp + ; SSE2: cost of 1 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i32v2double - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 1 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i32v2double - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 1 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i32v2double - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <2 x i32> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @sitofpv4i32v4double(<4 x i32> %a) { ; SSE2-LABEL: sitofpv4i32v4double - ; SSE2: cost of 40 {{.*}} sitofp + ; SSE2: cost of 2 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i32v4double ; AVX1: cost of 1 {{.*}} sitofp @@ -197,13 +197,13 @@ define <8 x double> @sitofpv8i32v8double(<8 x i32> %a) { ; SSE2-LABEL: sitofpv8i32v8double - ; SSE2: cost of 80 {{.*}} sitofp + ; SSE2: cost of 4 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i32v8double - ; AVX1: cost of 20 {{.*}} sitofp + ; AVX1: cost of 2 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv8i32v8double - ; AVX2: cost of 20 {{.*}} sitofp + ; AVX2: cost of 2 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv8i32v8double ; AVX512F: cost of 1 {{.*}} sitofp @@ -213,32 +213,32 @@ define <16 x double> @sitofpv16i32v16double(<16 x i32> %a) { ; SSE2-LABEL: sitofpv16i32v16double - ; SSE2: cost of 160 {{.*}} sitofp + ; SSE2: cost of 8 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i32v16double - ; AVX1: cost of 40 {{.*}} sitofp + ; AVX1: cost of 4 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i32v16double - ; AVX2: cost of 40 {{.*}} sitofp + ; AVX2: cost of 4 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i32v16double - ; AVX512F: cost of 44 {{.*}} sitofp + ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @sitofpv32i32v32double(<32 x i32> %a) { ; SSE2-LABEL: sitofpv32i32v32double - ; SSE2: cost of 320 {{.*}} sitofp + ; SSE2: cost of 16 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i32v32double - ; AVX1: cost of 80 {{.*}} sitofp + ; AVX1: cost of 8 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i32v32double - ; AVX2: cost of 80 {{.*}} sitofp + ; AVX2: cost of 8 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i32v32double - ; AVX512F: cost of 88 {{.*}} sitofp + ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <32 x i32> %a to <32 x double> ret <32 x double> %1 } @@ -264,13 +264,13 @@ ; SSE2: cost of 40 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i64v4double - ; AVX1: cost of 10 {{.*}} sitofp + ; AVX1: cost of 13 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv4i64v4double - ; AVX2: cost of 10 {{.*}} sitofp + ; AVX2: cost of 13 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv4i64v4double - ; AVX512F: cost of 10 {{.*}} sitofp + ; AVX512F: cost of 13 {{.*}} sitofp %1 = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %1 } @@ -280,13 +280,13 @@ ; SSE2: cost of 80 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i64v8double - ; AVX1: cost of 20 {{.*}} sitofp + ; AVX1: cost of 26 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv8i64v8double - ; AVX2: cost of 20 {{.*}} sitofp + ; AVX2: cost of 26 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv8i64v8double - ; AVX512F: cost of 22 {{.*}} sitofp + ; AVX512F: cost of 26 {{.*}} sitofp %1 = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %1 } @@ -296,13 +296,13 @@ ; SSE2: cost of 160 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i64v16double - ; AVX1: cost of 40 {{.*}} sitofp + ; AVX1: cost of 52 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i64v16double - ; AVX2: cost of 40 {{.*}} sitofp + ; AVX2: cost of 52 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i64v16double - ; AVX512F: cost of 44 {{.*}} sitofp + ; AVX512F: cost of 52 {{.*}} sitofp %1 = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %1 } @@ -312,36 +312,36 @@ ; SSE2: cost of 320 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i64v32double - ; AVX1: cost of 80 {{.*}} sitofp + ; AVX1: cost of 104 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i64v32double - ; AVX2: cost of 80 {{.*}} sitofp + ; AVX2: cost of 104 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i64v32double - ; AVX512F: cost of 88 {{.*}} sitofp + ; AVX512F: cost of 104 {{.*}} sitofp %1 = sitofp <32 x i64> %a to <32 x double> ret <32 x double> %1 } define <2 x float> @sitofpv2i8v2float(<2 x i8> %a) { ; SSE2-LABEL: sitofpv2i8v2float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 3 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i8v2float - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 3 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i8v2float - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 3 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i8v2float - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i8v4float(<4 x i8> %a) { ; SSE2-LABEL: sitofpv4i8v4float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 3 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i8v4float ; AVX1: cost of 3 {{.*}} sitofp @@ -357,7 +357,7 @@ define <8 x float> @sitofpv8i8v8float(<8 x i8> %a) { ; SSE2-LABEL: sitofpv8i8v8float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 6 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i8v8float ; AVX1: cost of 8 {{.*}} sitofp @@ -373,13 +373,13 @@ define <16 x float> @sitofpv16i8v16float(<16 x i8> %a) { ; SSE2-LABEL: sitofpv16i8v16float - ; SSE2: cost of 8 {{.*}} sitofp + ; SSE2: cost of 12 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i8v16float - ; AVX1: cost of 44 {{.*}} sitofp + ; AVX1: cost of 16 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i8v16float - ; AVX2: cost of 44 {{.*}} sitofp + ; AVX2: cost of 16 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i8v16float ; AVX512F: cost of 2 {{.*}} sitofp @@ -389,39 +389,39 @@ define <32 x float> @sitofpv32i8v32float(<32 x i8> %a) { ; SSE2-LABEL: sitofpv32i8v32float - ; SSE2: cost of 16 {{.*}} sitofp + ; SSE2: cost of 24 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i8v32float - ; AVX1: cost of 88 {{.*}} sitofp + ; AVX1: cost of 32 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i8v32float - ; AVX2: cost of 88 {{.*}} sitofp + ; AVX2: cost of 32 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i8v32float - ; AVX512F: cost of 92 {{.*}} sitofp + ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <32 x i8> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i16v2float(<2 x i16> %a) { ; SSE2-LABEL: sitofpv2i16v2float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 3 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i16v2float - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 3 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i16v2float - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 3 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i16v2float - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 3 {{.*}} sitofp %1 = sitofp <2 x i16> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i16v4float(<4 x i16> %a) { ; SSE2-LABEL: sitofpv4i16v4float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 3 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i16v4float ; AVX1: cost of 3 {{.*}} sitofp @@ -437,7 +437,7 @@ define <8 x float> @sitofpv8i16v8float(<8 x i16> %a) { ; SSE2-LABEL: sitofpv8i16v8float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 6 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i16v8float ; AVX1: cost of 5 {{.*}} sitofp @@ -453,13 +453,13 @@ define <16 x float> @sitofpv16i16v16float(<16 x i16> %a) { ; SSE2-LABEL: sitofpv16i16v16float - ; SSE2: cost of 30 {{.*}} sitofp + ; SSE2: cost of 12 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i16v16float - ; AVX1: cost of 44 {{.*}} sitofp + ; AVX1: cost of 10 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i16v16float - ; AVX2: cost of 44 {{.*}} sitofp + ; AVX2: cost of 10 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i16v16float ; AVX512F: cost of 2 {{.*}} sitofp @@ -469,39 +469,39 @@ define <32 x float> @sitofpv32i16v32float(<32 x i16> %a) { ; SSE2-LABEL: sitofpv32i16v32float - ; SSE2: cost of 60 {{.*}} sitofp + ; SSE2: cost of 24 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i16v32float - ; AVX1: cost of 88 {{.*}} sitofp + ; AVX1: cost of 20 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i16v32float - ; AVX2: cost of 88 {{.*}} sitofp + ; AVX2: cost of 20 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i16v32float - ; AVX512F: cost of 92 {{.*}} sitofp + ; AVX512F: cost of 4 {{.*}} sitofp %1 = sitofp <32 x i16> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i32v2float(<2 x i32> %a) { ; SSE2-LABEL: sitofpv2i32v2float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 1 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i32v2float - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 1 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i32v2float - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 1 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i32v2float - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 1 {{.*}} sitofp %1 = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i32v4float(<4 x i32> %a) { ; SSE2-LABEL: sitofpv4i32v4float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 1 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i32v4float ; AVX1: cost of 1 {{.*}} sitofp @@ -517,7 +517,7 @@ define <8 x float> @sitofpv8i32v8float(<8 x i32> %a) { ; SSE2-LABEL: sitofpv8i32v8float - ; SSE2: cost of 30 {{.*}} sitofp + ; SSE2: cost of 2 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i32v8float ; AVX1: cost of 1 {{.*}} sitofp @@ -533,13 +533,13 @@ define <16 x float> @sitofpv16i32v16float(<16 x i32> %a) { ; SSE2-LABEL: sitofpv16i32v16float - ; SSE2: cost of 60 {{.*}} sitofp + ; SSE2: cost of 4 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i32v16float - ; AVX1: cost of 44 {{.*}} sitofp + ; AVX1: cost of 2 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i32v16float - ; AVX2: cost of 44 {{.*}} sitofp + ; AVX2: cost of 2 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i32v16float ; AVX512F: cost of 1 {{.*}} sitofp @@ -549,109 +549,109 @@ define <32 x float> @sitofpv32i32v32float(<32 x i32> %a) { ; SSE2-LABEL: sitofpv32i32v32float - ; SSE2: cost of 120 {{.*}} sitofp + ; SSE2: cost of 8 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i32v32float - ; AVX1: cost of 88 {{.*}} sitofp + ; AVX1: cost of 4 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i32v32float - ; AVX2: cost of 88 {{.*}} sitofp + ; AVX2: cost of 4 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i32v32float - ; AVX512F: cost of 92 {{.*}} sitofp + ; AVX512F: cost of 2 {{.*}} sitofp %1 = sitofp <32 x i32> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @sitofpv2i64v2float(<2 x i64> %a) { ; SSE2-LABEL: sitofpv2i64v2float - ; SSE2: cost of 15 {{.*}} sitofp + ; SSE2: cost of 7 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i64v2float - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 7 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i64v2float - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 7 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i64v2float - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 7 {{.*}} sitofp %1 = sitofp <2 x i64> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @sitofpv4i64v4float(<4 x i64> %a) { ; SSE2-LABEL: sitofpv4i64v4float - ; SSE2: cost of 30 {{.*}} sitofp + ; SSE2: cost of 14 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv4i64v4float - ; AVX1: cost of 10 {{.*}} sitofp + ; AVX1: cost of 13 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv4i64v4float - ; AVX2: cost of 10 {{.*}} sitofp + ; AVX2: cost of 13 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv4i64v4float - ; AVX512F: cost of 10 {{.*}} sitofp + ; AVX512F: cost of 13 {{.*}} sitofp %1 = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %1 } define <8 x float> @sitofpv8i64v8float(<8 x i64> %a) { ; SSE2-LABEL: sitofpv8i64v8float - ; SSE2: cost of 60 {{.*}} sitofp + ; SSE2: cost of 28 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i64v8float - ; AVX1: cost of 22 {{.*}} sitofp + ; AVX1: cost of 26 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv8i64v8float - ; AVX2: cost of 22 {{.*}} sitofp + ; AVX2: cost of 26 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv8i64v8float - ; AVX512F: cost of 22 {{.*}} sitofp + ; AVX512F: cost of 26 {{.*}} sitofp %1 = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %1 } define <16 x float> @sitofpv16i64v16float(<16 x i64> %a) { ; SSE2-LABEL: sitofpv16i64v16float - ; SSE2: cost of 120 {{.*}} sitofp + ; SSE2: cost of 56 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i64v16float - ; AVX1: cost of 44 {{.*}} sitofp + ; AVX1: cost of 52 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i64v16float - ; AVX2: cost of 44 {{.*}} sitofp + ; AVX2: cost of 52 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i64v16float - ; AVX512F: cost of 46 {{.*}} sitofp + ; AVX512F: cost of 52 {{.*}} sitofp %1 = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %1 } define <32 x float> @sitofpv32i64v32float(<32 x i64> %a) { ; SSE2-LABEL: sitofpv32i64v32float - ; SSE2: cost of 240 {{.*}} sitofp + ; SSE2: cost of 112 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv32i64v32float - ; AVX1: cost of 88 {{.*}} sitofp + ; AVX1: cost of 104 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv32i64v32float - ; AVX2: cost of 88 {{.*}} sitofp + ; AVX2: cost of 104 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv32i64v32float - ; AVX512F: cost of 92 {{.*}} sitofp + ; AVX512F: cost of 104 {{.*}} sitofp %1 = sitofp <32 x i64> %a to <32 x float> ret <32 x float> %1 } define <8 x double> @sitofpv8i1v8double(<8 x double> %a) { ; SSE2-LABEL: sitofpv8i1v8double - ; SSE2: cost of 80 {{.*}} sitofp + ; SSE2: cost of 16 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv8i1v8double - ; AVX1: cost of 20 {{.*}} sitofp + ; AVX1: cost of 6 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv8i1v8double - ; AVX2: cost of 20 {{.*}} sitofp + ; AVX2: cost of 6 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv8i1v8double ; AVX512F: cost of 4 {{.*}} sitofp @@ -662,13 +662,13 @@ define <16 x float> @sitofpv16i1v16float(<16 x float> %a) { ; SSE2-LABEL: sitofpv16i1v16float - ; SSE2: cost of 8 {{.*}} sitofp + ; SSE2: cost of 40 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv16i1v16float - ; AVX1: cost of 44 {{.*}} sitofp + ; AVX1: cost of 16 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv16i1v16float - ; AVX2: cost of 44 {{.*}} sitofp + ; AVX2: cost of 16 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv16i1v16float ; AVX512F: cost of 3 {{.*}} sitofp Index: ../test/Analysis/CostModel/X86/sse-itoi.ll =================================================================== --- ../test/Analysis/CostModel/X86/sse-itoi.ll +++ ../test/Analysis/CostModel/X86/sse-itoi.ll @@ -3,7 +3,7 @@ define void @zext_v16i16_to_v16i32(<16 x i16>* %a) { ; SSE2: zext_v16i16_to_v16i32 -; SSE2: cost of 6 {{.*}} zext +; SSE2: cost of 5 {{.*}} zext ; ; SSE41: zext_v16i16_to_v16i32 ; SSE41: cost of 4 {{.*}} zext @@ -16,7 +16,7 @@ define void @sext_v16i16_to_v16i32(<16 x i16>* %a) { ; SSE2: sext_v16i16_to_v16i32 -; SSE2: cost of 8 {{.*}} sext +; SSE2: cost of 9 {{.*}} sext ; ; SSE41: sext_v16i16_to_v16i32 ; SSE41: cost of 4 {{.*}} sext @@ -42,7 +42,7 @@ define void @sext_v8i16_to_v8i32(<8 x i16>* %a) { ; SSE2: sext_v8i16_to_v8i32 -; SSE2: cost of 4 {{.*}} sext +; SSE2: cost of 5 {{.*}} sext ; ; SSE41: sext_v8i16_to_v8i32 ; SSE41: cost of 2 {{.*}} sext @@ -81,7 +81,7 @@ define void @zext_v16i8_to_v16i32(<16 x i8>* %a) { ; SSE2: zext_v16i8_to_v16i32 -; SSE2: cost of 9 {{.*}} zext +; SSE2: cost of 11 {{.*}} zext ; ; SSE41: zext_v16i8_to_v16i32 ; SSE41: cost of 4 {{.*}} zext @@ -94,7 +94,7 @@ define void @sext_v16i8_to_v16i32(<16 x i8>* %a) { ; SSE2: sext_v16i8_to_v16i32 -; SSE2: cost of 12 {{.*}} sext +; SSE2: cost of 15 {{.*}} sext ; ; SSE41: sext_v16i8_to_v16i32 ; SSE41: cost of 4 {{.*}} sext @@ -107,7 +107,7 @@ define void @zext_v8i8_to_v8i32(<8 x i8>* %a) { ; SSE2: zext_v8i8_to_v8i32 -; SSE2: cost of 6 {{.*}} zext +; SSE2: cost of 5 {{.*}} zext ; ; SSE41: zext_v8i8_to_v8i32 ; SSE41: cost of 2 {{.*}} zext Index: ../test/Analysis/CostModel/X86/uitofp.ll =================================================================== --- ../test/Analysis/CostModel/X86/uitofp.ll +++ ../test/Analysis/CostModel/X86/uitofp.ll @@ -6,13 +6,13 @@ define <2 x double> @uitofpv2i8v2double(<2 x i8> %a) { ; SSE2-LABEL: uitofpv2i8v2double - ; SSE2: cost of 20 {{.*}} uitofp + ; SSE2: cost of 2 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i8v2double - ; AVX1: cost of 4 {{.*}} uitofp + ; AVX1: cost of 2 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i8v2double - ; AVX2: cost of 4 {{.*}} uitofp + ; AVX2: cost of 2 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i8v2double ; AVX512F: cost of 2 {{.*}} uitofp @@ -22,7 +22,7 @@ define <4 x double> @uitofpv4i8v4double(<4 x i8> %a) { ; SSE2-LABEL: uitofpv4i8v4double - ; SSE2: cost of 40 {{.*}} uitofp + ; SSE2: cost of 4 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv4i8v4double ; AVX1: cost of 2 {{.*}} uitofp @@ -38,13 +38,13 @@ define <8 x double> @uitofpv8i8v8double(<8 x i8> %a) { ; SSE2-LABEL: uitofpv8i8v8double - ; SSE2: cost of 80 {{.*}} uitofp + ; SSE2: cost of 8 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv8i8v8double - ; AVX1: cost of 20 {{.*}} uitofp + ; AVX1: cost of 4 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv8i8v8double - ; AVX2: cost of 20 {{.*}} uitofp + ; AVX2: cost of 4 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv8i8v8double ; AVX512F: cost of 2 {{.*}} uitofp @@ -54,55 +54,55 @@ define <16 x double> @uitofpv16i8v16double(<16 x i8> %a) { ; SSE2-LABEL: uitofpv16i8v16double - ; SSE2: cost of 160 {{.*}} uitofp + ; SSE2: cost of 16 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv16i8v16double - ; AVX1: cost of 40 {{.*}} uitofp + ; AVX1: cost of 8 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv16i8v16double - ; AVX2: cost of 40 {{.*}} uitofp + ; AVX2: cost of 8 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv16i8v16double - ; AVX512F: cost of 44 {{.*}} uitofp + ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <16 x i8> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i8v32double(<32 x i8> %a) { ; SSE2-LABEL: uitofpv32i8v32double - ; SSE2: cost of 320 {{.*}} uitofp + ; SSE2: cost of 32 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv32i8v32double - ; AVX1: cost of 80 {{.*}} uitofp + ; AVX1: cost of 16 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv32i8v32double - ; AVX2: cost of 80 {{.*}} uitofp + ; AVX2: cost of 16 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv32i8v32double - ; AVX512F: cost of 88 {{.*}} uitofp + ; AVX512F: cost of 8 {{.*}} uitofp %1 = uitofp <32 x i8> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i16v2double(<2 x i16> %a) { ; SSE2-LABEL: uitofpv2i16v2double - ; SSE2: cost of 20 {{.*}} uitofp + ; SSE2: cost of 2 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i16v2double - ; AVX1: cost of 4 {{.*}} uitofp + ; AVX1: cost of 2 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i16v2double - ; AVX2: cost of 4 {{.*}} uitofp + ; AVX2: cost of 2 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i16v2double - ; AVX512F: cost of 5 {{.*}} uitofp + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <2 x i16> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i16v4double(<4 x i16> %a) { ; SSE2-LABEL: uitofpv4i16v4double - ; SSE2: cost of 40 {{.*}} uitofp + ; SSE2: cost of 4 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv4i16v4double ; AVX1: cost of 2 {{.*}} uitofp @@ -118,13 +118,13 @@ define <8 x double> @uitofpv8i16v8double(<8 x i16> %a) { ; SSE2-LABEL: uitofpv8i16v8double - ; SSE2: cost of 80 {{.*}} uitofp + ; SSE2: cost of 8 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv8i16v8double - ; AVX1: cost of 20 {{.*}} uitofp + ; AVX1: cost of 4 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv8i16v8double - ; AVX2: cost of 20 {{.*}} uitofp + ; AVX2: cost of 4 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv8i16v8double ; AVX512F: cost of 2 {{.*}} uitofp @@ -134,55 +134,55 @@ define <16 x double> @uitofpv16i16v16double(<16 x i16> %a) { ; SSE2-LABEL: uitofpv16i16v16double - ; SSE2: cost of 160 {{.*}} uitofp + ; SSE2: cost of 16 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv16i16v16double - ; AVX1: cost of 40 {{.*}} uitofp + ; AVX1: cost of 8 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv16i16v16double - ; AVX2: cost of 40 {{.*}} uitofp + ; AVX2: cost of 8 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv16i16v16double - ; AVX512F: cost of 44 {{.*}} uitofp + ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <16 x i16> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i16v32double(<32 x i16> %a) { ; SSE2-LABEL: uitofpv32i16v32double - ; SSE2: cost of 320 {{.*}} uitofp + ; SSE2: cost of 32 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv32i16v32double - ; AVX1: cost of 80 {{.*}} uitofp + ; AVX1: cost of 16 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv32i16v32double - ; AVX2: cost of 80 {{.*}} uitofp + ; AVX2: cost of 16 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv32i16v32double - ; AVX512F: cost of 88 {{.*}} uitofp + ; AVX512F: cost of 8 {{.*}} uitofp %1 = uitofp <32 x i16> %a to <32 x double> ret <32 x double> %1 } define <2 x double> @uitofpv2i32v2double(<2 x i32> %a) { ; SSE2-LABEL: uitofpv2i32v2double - ; SSE2: cost of 20 {{.*}} uitofp + ; SSE2: cost of 15 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i32v2double - ; AVX1: cost of 4 {{.*}} uitofp + ; AVX1: cost of 6 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i32v2double - ; AVX2: cost of 4 {{.*}} uitofp + ; AVX2: cost of 6 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i32v2double - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F: cost of 1 {{.*}} uitofp %1 = uitofp <2 x i32> %a to <2 x double> ret <2 x double> %1 } define <4 x double> @uitofpv4i32v4double(<4 x i32> %a) { ; SSE2-LABEL: uitofpv4i32v4double - ; SSE2: cost of 40 {{.*}} uitofp + ; SSE2: cost of 30 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv4i32v4double ; AVX1: cost of 6 {{.*}} uitofp @@ -198,13 +198,13 @@ define <8 x double> @uitofpv8i32v8double(<8 x i32> %a) { ; SSE2-LABEL: uitofpv8i32v8double - ; SSE2: cost of 80 {{.*}} uitofp + ; SSE2: cost of 60 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv8i32v8double - ; AVX1: cost of 20 {{.*}} uitofp + ; AVX1: cost of 12 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv8i32v8double - ; AVX2: cost of 20 {{.*}} uitofp + ; AVX2: cost of 12 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv8i32v8double ; AVX512F: cost of 1 {{.*}} uitofp @@ -214,32 +214,32 @@ define <16 x double> @uitofpv16i32v16double(<16 x i32> %a) { ; SSE2-LABEL: uitofpv16i32v16double - ; SSE2: cost of 160 {{.*}} uitofp + ; SSE2: cost of 120 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv16i32v16double - ; AVX1: cost of 40 {{.*}} uitofp + ; AVX1: cost of 24 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv16i32v16double - ; AVX2: cost of 40 {{.*}} uitofp + ; AVX2: cost of 24 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv16i32v16double - ; AVX512F: cost of 44 {{.*}} uitofp + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %1 } define <32 x double> @uitofpv32i32v32double(<32 x i32> %a) { ; SSE2-LABEL: uitofpv32i32v32double - ; SSE2: cost of 320 {{.*}} uitofp + ; SSE2: cost of 240 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv32i32v32double - ; AVX1: cost of 80 {{.*}} uitofp + ; AVX1: cost of 48 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv32i32v32double - ; AVX2: cost of 80 {{.*}} uitofp + ; AVX2: cost of 48 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv32i32v32double - ; AVX512F: cost of 88 {{.*}} uitofp + ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <32 x i32> %a to <32 x double> ret <32 x double> %1 } @@ -249,10 +249,10 @@ ; SSE2: cost of 20 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i64v2double - ; AVX1: cost of 20 {{.*}} uitofp + ; AVX1: cost of 10 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i64v2double - ; AVX2: cost of 20 {{.*}} uitofp + ; AVX2: cost of 10 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i64v2double ; AVX512F: cost of 5 {{.*}} uitofp @@ -268,10 +268,10 @@ ; SSE2: cost of 40 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv4i64v4double - ; AVX1: cost of 40 {{.*}} uitofp + ; AVX1: cost of 20 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv4i64v4double - ; AVX2: cost of 40 {{.*}} uitofp + ; AVX2: cost of 20 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv4i64v4double ; AVX512F: cost of 12 {{.*}} uitofp @@ -287,10 +287,10 @@ ; SSE2: cost of 80 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv8i64v8double - ; AVX1: cost of 20 {{.*}} uitofp + ; AVX1: cost of 40 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv8i64v8double - ; AVX2: cost of 20 {{.*}} uitofp + ; AVX2: cost of 40 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv8i64v8double ; AVX512F: cost of 26 {{.*}} uitofp @@ -306,16 +306,16 @@ ; SSE2: cost of 160 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv16i64v16double - ; AVX1: cost of 40 {{.*}} uitofp + ; AVX1: cost of 80 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv16i64v16double - ; AVX2: cost of 40 {{.*}} uitofp + ; AVX2: cost of 80 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv16i64v16double - ; AVX512F: cost of 44 {{.*}} uitofp + ; AVX512F: cost of 52 {{.*}} uitofp ; ; AVX512DQ: uitofpv16i64v16double - ; AVX512DQ: cost of 44 {{.*}} uitofp + ; AVX512DQ: cost of 2 {{.*}} uitofp %1 = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %1 } @@ -325,39 +325,39 @@ ; SSE2: cost of 320 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv32i64v32double - ; AVX1: cost of 80 {{.*}} uitofp + ; AVX1: cost of 160 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv32i64v32double - ; AVX2: cost of 80 {{.*}} uitofp + ; AVX2: cost of 160 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv32i64v32double - ; AVX512F: cost of 88 {{.*}} uitofp + ; AVX512F: cost of 104 {{.*}} uitofp ; ; AVX512DQ: uitofpv32i64v32double - ; AVX512DQ: cost of 88 {{.*}} uitofp + ; AVX512DQ: cost of 4 {{.*}} uitofp %1 = uitofp <32 x i64> %a to <32 x double> ret <32 x double> %1 } define <2 x float> @uitofpv2i8v2float(<2 x i8> %a) { ; SSE2-LABEL: uitofpv2i8v2float - ; SSE2: cost of 15 {{.*}} uitofp + ; SSE2: cost of 2 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i8v2float - ; AVX1: cost of 4 {{.*}} uitofp + ; AVX1: cost of 2 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i8v2float - ; AVX2: cost of 4 {{.*}} uitofp + ; AVX2: cost of 2 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i8v2float - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i8v4float(<4 x i8> %a) { ; SSE2-LABEL: uitofpv4i8v4float - ; SSE2: cost of 8 {{.*}} uitofp + ; SSE2: cost of 2 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv4i8v4float ; AVX1: cost of 2 {{.*}} uitofp @@ -373,7 +373,7 @@ define <8 x float> @uitofpv8i8v8float(<8 x i8> %a) { ; SSE2-LABEL: uitofpv8i8v8float - ; SSE2: cost of 15 {{.*}} uitofp + ; SSE2: cost of 4 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv8i8v8float ; AVX1: cost of 5 {{.*}} uitofp @@ -392,10 +392,10 @@ ; SSE2: cost of 8 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv16i8v16float - ; AVX1: cost of 44 {{.*}} uitofp + ; AVX1: cost of 10 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv16i8v16float - ; AVX2: cost of 44 {{.*}} uitofp + ; AVX2: cost of 10 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv16i8v16float ; AVX512F: cost of 2 {{.*}} uitofp @@ -408,36 +408,36 @@ ; SSE2: cost of 16 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv32i8v32float - ; AVX1: cost of 88 {{.*}} uitofp + ; AVX1: cost of 20 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv32i8v32float - ; AVX2: cost of 88 {{.*}} uitofp + ; AVX2: cost of 20 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv32i8v32float - ; AVX512F: cost of 92 {{.*}} uitofp + ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <32 x i8> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i16v2float(<2 x i16> %a) { ; SSE2-LABEL: uitofpv2i16v2float - ; SSE2: cost of 15 {{.*}} uitofp + ; SSE2: cost of 2 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i16v2float - ; AVX1: cost of 4 {{.*}} uitofp + ; AVX1: cost of 2 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i16v2float - ; AVX2: cost of 4 {{.*}} uitofp + ; AVX2: cost of 2 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i16v2float - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <2 x i16> %a to <2 x float> ret <2 x float> %1 } define <4 x float> @uitofpv4i16v4float(<4 x i16> %a) { ; SSE2-LABEL: uitofpv4i16v4float - ; SSE2: cost of 8 {{.*}} uitofp + ; SSE2: cost of 2 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv4i16v4float ; AVX1: cost of 2 {{.*}} uitofp @@ -453,7 +453,7 @@ define <8 x float> @uitofpv8i16v8float(<8 x i16> %a) { ; SSE2-LABEL: uitofpv8i16v8float - ; SSE2: cost of 15 {{.*}} uitofp + ; SSE2: cost of 4 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv8i16v8float ; AVX1: cost of 5 {{.*}} uitofp @@ -469,13 +469,13 @@ define <16 x float> @uitofpv16i16v16float(<16 x i16> %a) { ; SSE2-LABEL: uitofpv16i16v16float - ; SSE2: cost of 30 {{.*}} uitofp + ; SSE2: cost of 8 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv16i16v16float - ; AVX1: cost of 44 {{.*}} uitofp + ; AVX1: cost of 10 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv16i16v16float - ; AVX2: cost of 44 {{.*}} uitofp + ; AVX2: cost of 10 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv16i16v16float ; AVX512F: cost of 2 {{.*}} uitofp @@ -485,29 +485,29 @@ define <32 x float> @uitofpv32i16v32float(<32 x i16> %a) { ; SSE2-LABEL: uitofpv32i16v32float - ; SSE2: cost of 60 {{.*}} uitofp + ; SSE2: cost of 16 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv32i16v32float - ; AVX1: cost of 88 {{.*}} uitofp + ; AVX1: cost of 20 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv32i16v32float - ; AVX2: cost of 88 {{.*}} uitofp + ; AVX2: cost of 20 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv32i16v32float - ; AVX512F: cost of 92 {{.*}} uitofp + ; AVX512F: cost of 4 {{.*}} uitofp %1 = uitofp <32 x i16> %a to <32 x float> ret <32 x float> %1 } define <2 x float> @uitofpv2i32v2float(<2 x i32> %a) { ; SSE2-LABEL: uitofpv2i32v2float - ; SSE2: cost of 15 {{.*}} uitofp + ; SSE2: cost of 8 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i32v2float - ; AVX1: cost of 4 {{.*}} uitofp + ; AVX1: cost of 8 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i32v2float - ; AVX2: cost of 4 {{.*}} uitofp + ; AVX2: cost of 8 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i32v2float ; AVX512F: cost of 2 {{.*}} uitofp @@ -552,10 +552,10 @@ ; SSE2: cost of 32 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv16i32v16float - ; AVX1: cost of 44 {{.*}} uitofp + ; AVX1: cost of 18 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv16i32v16float - ; AVX2: cost of 44 {{.*}} uitofp + ; AVX2: cost of 16 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv16i32v16float ; AVX512F: cost of 1 {{.*}} uitofp @@ -568,13 +568,13 @@ ; SSE2: cost of 64 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv32i32v32float - ; AVX1: cost of 88 {{.*}} uitofp + ; AVX1: cost of 36 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv32i32v32float - ; AVX2: cost of 88 {{.*}} uitofp + ; AVX2: cost of 32 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv32i32v32float - ; AVX512F: cost of 92 {{.*}} uitofp + ; AVX512F: cost of 2 {{.*}} uitofp %1 = uitofp <32 x i32> %a to <32 x float> ret <32 x float> %1 } @@ -584,13 +584,13 @@ ; SSE2: cost of 15 {{.*}} uitofp ; ; AVX1-LABEL: uitofpv2i64v2float - ; AVX1: cost of 4 {{.*}} uitofp + ; AVX1: cost of 15 {{.*}} uitofp ; ; AVX2-LABEL: uitofpv2i64v2float - ; AVX2: cost of 4 {{.*}} uitofp + ; AVX2: cost of 15 {{.*}} uitofp ; ; AVX512F-LABEL: uitofpv2i64v2float - ; AVX512F: cost of 4 {{.*}} uitofp + ; AVX512F: cost of 5 {{.*}} uitofp %1 = uitofp <2 x i64> %a to <2 x float> ret <2 x float> %1 } Index: ../test/Transforms/LoopVectorize/X86/conversion-cost.ll =================================================================== --- ../test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ ../test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -25,7 +25,7 @@ } ;CHECK-LABEL: @conversion_cost2( -;CHECK: <2 x float> +;CHECK: <8 x float> ;CHECK: ret define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 9 Index: ../test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll =================================================================== --- ../test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll +++ ../test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll @@ -5,8 +5,8 @@ target triple = "x86_64-apple-macosx10.8.0" -; CHECK: cost of 20 for VF 2 For instruction: %conv = uitofp i64 %tmp to double -; CHECK: cost of 40 for VF 4 For instruction: %conv = uitofp i64 %tmp to double +; CHECK: cost of 10 for VF 2 For instruction: %conv = uitofp i64 %tmp to double +; CHECK: cost of 20 for VF 4 For instruction: %conv = uitofp i64 %tmp to double define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind { entry: br label %for.body