diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1608,16 +1608,50 @@ }; if (ST->hasSSE2()) { - bool IsLoad = - llvm::any_of(Args, [](const auto &V) { return isa(V); }); - if (ST->hasSSE3() && IsLoad) - if (const auto *Entry = - CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { - assert(isLegalBroadcastLoad(BaseTp->getElementType(), - LT.second.getVectorElementCount()) && - "Table entry missing from isLegalBroadcastLoad()"); - return LT.first * Entry->Cost; + if (bool IsLoad = !Args.empty() && isa(Args[0])) { + const LoadInst *L = cast(Args[0]); + // There are two use cases: + // Case 1 (scalar): + // %ld = double ... + // %add1 = fadd double %ld + // %add2 = fadd double %ld + // This is typically used by SLP. + // + // Case 2 (vector): + // %ld = double ... + // %ins = insertelement <2 x double> undef, double %ld, i32 0 + // %shf = shufflevector <2 x double> %ins, ... + // This is the canonicalized form of the broadcast, so we are only + // matching this one and not multiple insertelements. + // + // In order to tell which case we are in we are checking the users of the + // load. If the users are all scalar, then we are in Case 1. Else if there + // is at least one vector user we are in Case 2. + // + // In Case 1 we cannot be sure whether all the users will be converted to + // vectors, and that there won't be any other external user to the load. + // This is up to the caller. So we eagerly consider that codegen will be + // able to combine load + broadcast. + // + // In Case 2 we need to make sure that the load has a single user. + // Otherwise codegen won't be able to combine load + broadcast. + + // Limit the users we visit to save compilation time. 
+ if (!L->hasNUsesOrMore(16)) { + bool IsVectorCase = llvm::any_of(L->users(), [](const User *U) { + return cast(U)->getType()->isVectorTy(); + }); + bool LoadCanBeCombined = IsVectorCase ? L->hasOneUse() : true; + if (ST->hasSSE3() && LoadCanBeCombined) + if (const auto *Entry = + CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { + assert(isLegalBroadcastLoad(BaseTp->getElementType(), + LT.second.getVectorElementCount()) && + "Table entry missing from isLegalBroadcastLoad()"); + return LT.first * Entry->Cost; + } } + } if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-load.ll b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-load.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll @@ -471,3 +471,49 @@ ret void } + +; Checks the cost of a load+broadcast that cannot be combined. +define void @multiple_uses() { +; SSE-LABEL: 'multiple_uses' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_1 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_2 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE2-LABEL: 'multiple_uses' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_1 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_2 = shufflevector <2 x 
double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE3-LABEL: 'multiple_uses' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_1 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_2 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'multiple_uses' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_1 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_2 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'multiple_uses' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_1 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_2 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'multiple_uses' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = 
load <2 x double>, ptr undef, align 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_1 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64_2 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +bb1: + %ld_2xf64 = load <2 x double>, ptr undef + ; Load has multiple uses + %sf_2xf64_1 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer + %sf_2xf64_2 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer + ret void +}