Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp @@ -528,6 +528,9 @@ int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, @@ -705,7 +708,38 @@ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, }; - static const TypeConversionCostTblEntry SSE2ConvTbl[] = { + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { // These are somewhat magic numbers justified by looking at the output of // Intel's IACA, running some kernels and making sure when we take // legalization into account the throughput will be overestimated. @@ -726,13 +760,42 @@ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 14 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 7 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 31 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, }; std::pair LTSrc = TLI->getTypeLegalizationCost(DL, Src); std::pair LTDest = TLI->getTypeLegalizationCost(DL, Dst); if (ST->hasSSE2() && !ST->hasAVX()) { - if (const auto *Entry = ConvertCostTableLookup(SSE2ConvTbl, ISD, + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, LTDest.second, LTSrc.second)) return LTSrc.first * Entry->Cost; } @@ -770,6 +833,20 @@ return Entry->Cost; } + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + return BaseT::getCastInstrCost(Opcode, Dst, Src); } Index: llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll +++ llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll @@ -248,13 +248,13 @@ ; SSE2: cost of 20 {{.*}} sitofp ; ; AVX1-LABEL: sitofpv2i64v2double - ; AVX1: cost of 4 {{.*}} sitofp + ; AVX1: cost of 20 {{.*}} sitofp ; ; AVX2-LABEL: sitofpv2i64v2double - ; AVX2: cost of 4 {{.*}} sitofp + ; AVX2: cost of 20 {{.*}} sitofp ; ; AVX512F-LABEL: sitofpv2i64v2double - ; AVX512F: cost of 4 {{.*}} sitofp + ; AVX512F: cost of 20 {{.*}} sitofp %1 = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %1 } Index: llvm/trunk/test/Analysis/CostModel/X86/sse-itoi.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/sse-itoi.ll +++ llvm/trunk/test/Analysis/CostModel/X86/sse-itoi.ll @@ -0,0 +1,353 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s + +define void @zext_v16i16_to_v16i32(<16 x i16>* %a) { +; SSE2: zext_v16i16_to_v16i32 +; SSE2: cost of 6 {{.*}} zext +; +; SSE41: zext_v16i16_to_v16i32 +; SSE41: cost of 4 {{.*}} zext +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = zext <16 x i16> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @sext_v16i16_to_v16i32(<16 x i16>* %a) { +; SSE2: sext_v16i16_to_v16i32 +; SSE2: cost of 8 {{.*}} sext +; +; SSE41: sext_v16i16_to_v16i32 +; SSE41: cost of 4 {{.*}} sext +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = sext <16 x i16> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @zext_v8i16_to_v8i32(<8 x i16>* %a) { +; SSE2: zext_v8i16_to_v8i32 +; SSE2: cost of 3 {{.*}} zext +; +; SSE41: zext_v8i16_to_v8i32 +; SSE41: cost of 2 {{.*}} zext +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = zext <8 x i16> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @sext_v8i16_to_v8i32(<8 x i16>* %a) { +; SSE2: sext_v8i16_to_v8i32 +; SSE2: cost of 4 {{.*}} sext +; +; SSE41: sext_v8i16_to_v8i32 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = sext <8 x i16> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @zext_v4i16_to_v4i32(<4 x i16>* %a) { +; SSE2: zext_v4i16_to_v4i32 +; SSE2: cost of 1 {{.*}} zext +; +; SSE41: zext_v4i16_to_v4i32 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = zext <4 x i16> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @sext_v4i16_to_v4i32(<4 x i16>* %a) { +; SSE2: sext_v4i16_to_v4i32 +; SSE2: cost of 2 {{.*}} sext +; +; SSE41: sext_v4i16_to_v4i32 +; SSE41: cost of 1 {{.*}} sext +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = sext <4 x i16> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @zext_v16i8_to_v16i32(<16 x i8>* %a) { +; SSE2: zext_v16i8_to_v16i32 +; SSE2: cost of 9 {{.*}} zext +; +; SSE41: zext_v16i8_to_v16i32 +; SSE41: cost of 4 {{.*}} zext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = zext <16 x i8> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @sext_v16i8_to_v16i32(<16 x i8>* %a) { +; SSE2: sext_v16i8_to_v16i32 +; SSE2: cost of 12 {{.*}} sext +; +; SSE41: sext_v16i8_to_v16i32 +; SSE41: cost of 4 {{.*}} sext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = sext <16 x i8> %1 to <16 x i32> + store <16 x i32> %2, <16 x i32>* undef, align 4 + ret void +} + +define void @zext_v8i8_to_v8i32(<8 x i8>* %a) { +; SSE2: zext_v8i8_to_v8i32 +; SSE2: cost of 6 {{.*}} zext +; +; SSE41: zext_v8i8_to_v8i32 +; SSE41: cost of 2 {{.*}} zext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = zext <8 x i8> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @sext_v8i8_to_v8i32(<8 x i8>* %a) { +; SSE2: sext_v8i8_to_v8i32 +; SSE2: cost of 6 {{.*}} sext +; +; SSE41: sext_v8i8_to_v8i32 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = sext <8 x i8> %1 to <8 x i32> + store <8 x i32> %2, <8 x i32>* undef, align 4 + ret void +} + +define void @zext_v4i8_to_v4i32(<4 x i8>* %a) { +; SSE2: zext_v4i8_to_v4i32 +; SSE2: cost of 2 {{.*}} zext +; +; SSE41: zext_v4i8_to_v4i32 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = zext <4 x i8> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @sext_v4i8_to_v4i32(<4 x i8>* %a) { +; SSE2: sext_v4i8_to_v4i32 +; SSE2: cost of 3 {{.*}} sext +; +; SSE41: sext_v4i8_to_v4i32 +; SSE41: cost of 1 {{.*}} sext +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = sext <4 x i8> %1 to <4 x i32> + store <4 x i32> %2, <4 x i32>* undef, align 4 + ret void +} + +define void @zext_v16i8_to_v16i16(<16 x i8>* %a) { +; SSE2: zext_v16i8_to_v16i16 +; SSE2: cost of 3 {{.*}} zext +; +; SSE41: zext_v16i8_to_v16i16 +; SSE41: cost of 2 {{.*}} zext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = zext <16 x i8> %1 to <16 x i16> + store <16 x i16> %2, <16 x i16>* undef, align 4 + ret void +} + +define void @sext_v16i8_to_v16i16(<16 x i8>* %a) { +; SSE2: sext_v16i8_to_v16i16 +; SSE2: cost of 4 {{.*}} sext +; +; SSE41: sext_v16i8_to_v16i16 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = sext <16 x i8> %1 to <16 x i16> + store <16 x i16> %2, <16 x i16>* undef, align 4 + ret void +} + +define void @zext_v8i8_to_v8i16(<8 x i8>* %a) { +; SSE2: zext_v8i8_to_v8i16 +; SSE2: cost of 1 {{.*}} zext +; +; SSE41: zext_v8i8_to_v8i16 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = zext <8 x i8> %1 to <8 x i16> + store <8 x i16> %2, <8 x i16>* undef, align 4 + ret void +} + +define void @sext_v8i8_to_v8i16(<8 x i8>* %a) { +; SSE2: sext_v8i8_to_v8i16 +; SSE2: cost of 2 {{.*}} sext +; +; SSE41: sext_v8i8_to_v8i16 +; SSE41: cost of 1 {{.*}} sext +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = sext <8 x i8> %1 to <8 x i16> + store <8 x i16> %2, <8 x i16>* undef, align 4 + ret void +} + +define void @zext_v4i8_to_v4i16(<4 x i8>* %a) { +; SSE2: zext_v4i8_to_v4i16 +; SSE2: cost of 1 {{.*}} zext +; +; SSE41: zext_v4i8_to_v4i16 +; SSE41: cost of 1 {{.*}} zext +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = zext <4 x i8> %1 to <4 x i16> + store <4 x i16> %2, <4 x i16>* undef, align 4 + ret void +} + +define void @sext_v4i8_to_v4i16(<4 x i8>* %a) { +; SSE2: sext_v4i8_to_v4i16 +; SSE2: cost of 6 {{.*}} sext +; +; SSE41: sext_v4i8_to_v4i16 +; SSE41: cost of 2 {{.*}} sext +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = sext <4 x i8> %1 to <4 x i16> + store <4 x i16> %2, <4 x i16>* undef, align 4 + ret void +} + +define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) { +; SSE2: truncate_v16i32_to_v16i16 +; SSE2: cost of 14 {{.*}} trunc +; +; SSE41: truncate_v16i32_to_v16i16 +; SSE41: cost of 6 {{.*}} trunc +; + %1 = load <16 x i32>, <16 x i32>* %a + %2 = trunc <16 x i32> %1 to <16 x i16> + store <16 x i16> %2, <16 x i16>* undef, align 4 + ret void +} + +define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) { +; SSE2: truncate_v8i32_to_v8i16 +; SSE2: cost of 7 {{.*}} trunc +; +; SSE41: truncate_v8i32_to_v8i16 +; SSE41: cost of 3 {{.*}} trunc +; + %1 = load <8 x i32>, <8 x i32>* %a + %2 = trunc <8 x i32> %1 to <8 x i16> + store <8 x i16> %2, <8 x i16>* undef, align 4 + ret void +} + +define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) { +; SSE2: truncate_v4i32_to_v4i16 +; SSE2: cost of 3 {{.*}} trunc +; +; SSE41: truncate_v4i32_to_v4i16 +; SSE41: cost of 1 {{.*}} trunc +; + %1 = load <4 x i32>, <4 x i32>* %a + %2 = trunc <4 x i32> %1 to <4 x i16> + store <4 x i16> %2, <4 x i16>* undef, align 4 + ret void +} + +define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) { +; SSE2: truncate_v16i32_to_v16i8 +; SSE2: cost of 31 {{.*}} trunc +; +; SSE41: truncate_v16i32_to_v16i8 +; SSE41: cost of 30 {{.*}} trunc +; + %1 = load <16 x i32>, <16 x i32>* %a + %2 = trunc <16 x i32> %1 to <16 x i8> + store <16 x i8> %2, <16 x i8>* undef, align 4 + ret void +} + +define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) { +; SSE2: truncate_v8i32_to_v8i8 +; SSE2: cost of 4 {{.*}} trunc +; +; SSE41: truncate_v8i32_to_v8i8 +; SSE41: cost of 3 {{.*}} trunc +; + %1 = load <8 x i32>, <8 x i32>* %a + %2 = trunc <8 x i32> %1 to <8 x i8> + store <8 x i8> %2, <8 x i8>* undef, align 4 + ret void +} + +define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) { +; SSE2: truncate_v4i32_to_v4i8 +; SSE2: cost of 3 {{.*}} trunc +; +; SSE41: truncate_v4i32_to_v4i8 +; SSE41: cost of 1 {{.*}} trunc +; + %1 = load <4 x i32>, <4 x i32>* %a + %2 = trunc <4 x i32> %1 to <4 x i8> + store <4 x i8> %2, <4 x i8>* undef, align 4 + ret void +} + +define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) { +; SSE2: truncate_v16i16_to_v16i8 +; SSE2: cost of 3 {{.*}} trunc +; +; SSE41: truncate_v16i16_to_v16i8 +; SSE41: cost of 3 {{.*}} trunc +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = trunc <16 x i16> %1 to <16 x i8> + store <16 x i8> %2, <16 x i8>* undef, align 4 + ret void +} + +define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) { +; SSE2: truncate_v8i16_to_v8i8 +; SSE2: cost of 2 {{.*}} trunc +; +; SSE41: truncate_v8i16_to_v8i8 +; SSE41: cost of 1 {{.*}} trunc +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = trunc <8 x i16> %1 to <8 x i8> + store <8 x i8> %2, <8 x i8>* undef, align 4 + ret void +} + +define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) { +; SSE2: truncate_v4i16_to_v4i8 +; SSE2: cost of 4 {{.*}} trunc +; +; SSE41: truncate_v4i16_to_v4i8 +; SSE41: cost of 2 {{.*}} trunc +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = trunc <4 x i16> %1 to <4 x i8> + store <4 x i8> %2, <4 x i8>* undef, align 4 + ret void +}