diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -54,6 +54,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/TargetParser/X86TargetParser.h"
@@ -13324,16 +13325,15 @@
 }

 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
-  return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
+  uint64_t Mask = llvm::X86::getCpuSupportsMask(FeatureStrs);
+  std::array<uint32_t, 4> FeatureMask{Lo_32(Mask), Hi_32(Mask), 0, 0};
+  return EmitX86CpuSupports(FeatureMask);
 }

-llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
-  uint32_t Features1 = Lo_32(FeaturesMask);
-  uint32_t Features2 = Hi_32(FeaturesMask);
-
+llvm::Value *
+CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
   Value *Result = Builder.getTrue();
-
-  if (Features1 != 0) {
+  if (FeatureMask[0] != 0) {
     // Matching the struct layout from the compiler-rt/libgcc structure that is
     // filled in:
     // unsigned int __cpu_vendor;
@@ -13356,22 +13356,26 @@
                                                 CharUnits::fromQuantity(4));

     // Check the value of the bit corresponding to the feature requested.
-    Value *Mask = Builder.getInt32(Features1);
+    Value *Mask = Builder.getInt32(FeatureMask[0]);
     Value *Bitset = Builder.CreateAnd(Features, Mask);
     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
     Result = Builder.CreateAnd(Result, Cmp);
   }

-  if (Features2 != 0) {
-    llvm::Constant *CpuFeatures2 = CGM.CreateRuntimeVariable(Int32Ty,
-                                                             "__cpu_features2");
-    cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
-
-    Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures2,
-                                                CharUnits::fromQuantity(4));
-
+  llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3);
+  llvm::Constant *CpuFeatures2 =
+      CGM.CreateRuntimeVariable(ATy, "__cpu_features2");
+  cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
+  for (int i = 1; i != 4; ++i) {
+    const uint32_t M = FeatureMask[i];
+    if (!M)
+      continue;
+    Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
+    Value *Features = Builder.CreateAlignedLoad(
+        Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs),
+        CharUnits::fromQuantity(4));
     // Check the value of the bit corresponding to the feature requested.
-    Value *Mask = Builder.getInt32(Features2);
+    Value *Mask = Builder.getInt32(M);
     Value *Bitset = Builder.CreateAnd(Features, Mask);
     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
     Result = Builder.CreateAnd(Result, Cmp);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4902,7 +4902,7 @@
   llvm::Value *EmitX86CpuIs(StringRef CPUStr);
   llvm::Value *EmitX86CpuSupports(const CallExpr *E);
   llvm::Value *EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs);
-  llvm::Value *EmitX86CpuSupports(uint64_t Mask);
+  llvm::Value *EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask);
   llvm::Value *EmitX86CpuInit();
   llvm::Value *FormX86ResolverCondition(const MultiVersionResolverOption &RO);
   llvm::Value *EmitAArch64CpuInit();
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2681,8 +2681,27 @@
     const MultiVersionResolverOption &RO) {
   llvm::Value *Condition = nullptr;

-  if (!RO.Conditions.Architecture.empty())
-    Condition = EmitX86CpuIs(RO.Conditions.Architecture);
+  if (!RO.Conditions.Architecture.empty()) {
+    StringRef Arch = RO.Conditions.Architecture;
+    std::array<uint32_t, 4> Mask{};
+    // If arch= specifies an x86-64 micro-architecture level, test a special
+    // feature named FEATURE_X86_64_*; otherwise use __builtin_cpu_is.
+    if (Arch.consume_front("x86-64")) {
+      if (Arch.empty()) // FEATURE_X86_64_BASELINE 95==2*32+31
+        Mask[2] = 1u << 31;
+      else if (Arch == "-v2") // FEATURE_X86_64_V2 96==3*32+0
+        Mask[3] = 1u << 0;
+      else if (Arch == "-v3") // FEATURE_X86_64_V3 97==3*32+1
+        Mask[3] = 1u << 1;
+      else if (Arch == "-v4") // FEATURE_X86_64_V4 98==3*32+2
+        Mask[3] = 1u << 2;
+      else
+        llvm_unreachable("invalid x86-64 micro-architecture level");
+      Condition = EmitX86CpuSupports(Mask);
+    } else {
+      Condition = EmitX86CpuIs(Arch);
+    }
+  }

   if (!RO.Conditions.Features.empty()) {
     llvm::Value *FeatureCond = EmitX86CpuSupports(RO.Conditions.Features);
diff --git a/clang/test/CodeGen/attr-target-clones.c b/clang/test/CodeGen/attr-target-clones.c
--- a/clang/test/CodeGen/attr-target-clones.c
+++ b/clang/test/CodeGen/attr-target-clones.c
@@ -13,6 +13,9 @@
 // WINDOWS: $foo_inline = comdat any
 // WINDOWS: $foo_inline2 = comdat any

+// LINUX: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
+// LINUX: @__cpu_features2 = external dso_local global [3 x i32]
+
 // LINUX: @foo.ifunc = weak_odr ifunc i32 (), ptr @foo.resolver
 // LINUX: @foo_dupes.ifunc = weak_odr ifunc void (), ptr @foo_dupes.resolver
 // LINUX: @unused.ifunc = weak_odr ifunc void (), ptr @unused.resolver
@@ -137,6 +140,28 @@
 // WINDOWS: musttail call i32 @foo_used_no_defn.sse4.2.0
 // WINDOWS: musttail call i32 @foo_used_no_defn.default.1

+__attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2", "arch=x86-64-v3", "arch=x86-64-v4")))
+int isa_level(int) { return 0; }
+// LINUX: define{{.*}} i32 @isa_level.default.4(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64.0(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v2.1(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v3.2(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v4.3(
+// LINUX: define weak_odr ptr @isa_level.resolver() comdat
+// LINUX: call void @__cpu_indicator_init()
+// LINUX-NEXT: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
+// LINUX-NEXT: and i32 %[[#]], 4
+// LINUX: ret ptr @isa_level.arch_x86-64-v4.3
+// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
+// LINUX-NEXT: and i32 %[[#]], 2
+// LINUX: ret ptr @isa_level.arch_x86-64-v3.2
+// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
+// LINUX-NEXT: and i32 %[[#]], 1
+// LINUX: ret ptr @isa_level.arch_x86-64-v2.1
+// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 1)
+// LINUX-NEXT: and i32 %[[#]], -2147483648
+// LINUX: ret ptr @isa_level.arch_x86-64.0
+// LINUX: ret ptr @isa_level.default.4

 // Deferred emission of inline definitions.
diff --git a/clang/test/CodeGen/builtin-cpu-supports.c b/clang/test/CodeGen/builtin-cpu-supports.c
--- a/clang/test/CodeGen/builtin-cpu-supports.c
+++ b/clang/test/CodeGen/builtin-cpu-supports.c
@@ -5,7 +5,7 @@
 extern void a(const char *);

 // CHECK: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
-// CHECK: @__cpu_features2 = external dso_local global i32
+// CHECK: @__cpu_features2 = external dso_local global [3 x i32]

 int main(void) {
   __builtin_cpu_init();
diff --git a/clang/test/Sema/attr-target-clones.c b/clang/test/Sema/attr-target-clones.c
--- a/clang/test/Sema/attr-target-clones.c
+++ b/clang/test/Sema/attr-target-clones.c
@@ -118,3 +118,7 @@
 // expected-error@+1 {{attribute 'target_clones' multiversioning cannot be combined with attribute 'overloadable'}}
 void __attribute__((__overloadable__)) good_overload5(void) __attribute__((target_clones("mmx", "sse4.2", "default")));
 void good_overload5(int) __attribute__((target_clones("mmx", "sse4.2", "default")));
+
+void good_isa_level(int) __attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2", "arch=x86-64-v3", "arch=x86-64-v4")));
+// expected-warning@+1 {{unsupported CPU 'x86-64-v5' in the 'target_clones' attribute string; 'target_clones' attribute ignored}}
+void bad_isa_level(int) __attribute__((target_clones("default", "arch=x86-64-v5")));
diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c
--- a/compiler-rt/lib/builtins/cpu_model.c
+++ b/compiler-rt/lib/builtins/cpu_model.c
@@ -158,6 +158,19 @@
   FEATURE_AVX512BITALG,
   FEATURE_AVX512BF16,
   FEATURE_AVX512VP2INTERSECT,
+
+  FEATURE_CMPXCHG16B = 46,
+  FEATURE_F16C = 49,
+  FEATURE_LAHF_LM = 54,
+  FEATURE_LM,
+  FEATURE_WP,
+  FEATURE_LZCNT,
+  FEATURE_MOVBE,
+
+  FEATURE_X86_64_BASELINE = 95,
+  FEATURE_X86_64_V2,
+  FEATURE_X86_64_V3,
+  FEATURE_X86_64_V4,
   CPU_FEATURE_MAX
 };

@@ -677,6 +690,7 @@
                                  unsigned *Features) {
   unsigned EAX = 0, EBX = 0;

+#define hasFeature(F) ((Features[F / 32] >> (F % 32)) & 1)
 #define setFeature(F)                                                          \
   Features[F / 32] |= 1U << (F % 32)

@@ -697,14 +711,20 @@
     setFeature(FEATURE_SSSE3);
   if ((ECX >> 12) & 1)
     setFeature(FEATURE_FMA);
+  if ((ECX >> 13) & 1)
+    setFeature(FEATURE_CMPXCHG16B);
   if ((ECX >> 19) & 1)
     setFeature(FEATURE_SSE4_1);
   if ((ECX >> 20) & 1)
     setFeature(FEATURE_SSE4_2);
+  if ((ECX >> 22) & 1)
+    setFeature(FEATURE_MOVBE);
   if ((ECX >> 23) & 1)
     setFeature(FEATURE_POPCNT);
   if ((ECX >> 25) & 1)
     setFeature(FEATURE_AES);
+  if ((ECX >> 29) & 1)
+    setFeature(FEATURE_F16C);

   // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
   // indicates that the AVX registers will be saved and restored on context
@@ -786,12 +806,39 @@
   bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
                      !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
-  if (HasExtLeaf1 && ((ECX >> 6) & 1))
-    setFeature(FEATURE_SSE4_A);
-  if (HasExtLeaf1 && ((ECX >> 11) & 1))
-    setFeature(FEATURE_XOP);
-  if (HasExtLeaf1 && ((ECX >> 16) & 1))
-    setFeature(FEATURE_FMA4);
+  if (HasExtLeaf1) {
+    if (ECX & 1)
+      setFeature(FEATURE_LAHF_LM);
+    if ((ECX >> 5) & 1)
+      setFeature(FEATURE_LZCNT);
+    if (((ECX >> 6) & 1))
+      setFeature(FEATURE_SSE4_A);
+    if (((ECX >> 11) & 1))
+      setFeature(FEATURE_XOP);
+    if (((ECX >> 16) & 1))
+      setFeature(FEATURE_FMA4);
+    if (((EDX >> 29) & 1))
+      setFeature(FEATURE_LM);
+  }
+
+  if (hasFeature(FEATURE_LM) && hasFeature(FEATURE_SSE2)) {
+    setFeature(FEATURE_X86_64_BASELINE);
+    if (hasFeature(FEATURE_CMPXCHG16B) && hasFeature(FEATURE_POPCNT) &&
+        hasFeature(FEATURE_LAHF_LM) && hasFeature(FEATURE_SSE4_2)) {
+      setFeature(FEATURE_X86_64_V2);
+      if (hasFeature(FEATURE_AVX2) && hasFeature(FEATURE_BMI) &&
+          hasFeature(FEATURE_BMI2) && hasFeature(FEATURE_F16C) &&
+          hasFeature(FEATURE_FMA) && hasFeature(FEATURE_LZCNT) &&
+          hasFeature(FEATURE_MOVBE)) {
+        setFeature(FEATURE_X86_64_V3);
+        if (hasFeature(FEATURE_AVX512BW) && hasFeature(FEATURE_AVX512CD) &&
+            hasFeature(FEATURE_AVX512DQ) && hasFeature(FEATURE_AVX512VL))
+          setFeature(FEATURE_X86_64_V4);
+      }
+    }
+  }
+
+#undef hasFeature
 #undef setFeature
 }

@@ -813,7 +860,7 @@
 #ifndef _WIN32
 __attribute__((visibility("hidden")))
 #endif
-unsigned int __cpu_features2 = 0;
+unsigned __cpu_features2[(CPU_FEATURE_MAX - 1) / 32];

 // A constructor function that is sets __cpu_model and __cpu_features2 with
 // the right values.  This needs to run only once.  This constructor is
@@ -827,6 +874,8 @@
   unsigned Vendor;
   unsigned Model, Family;
   unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0};
+  static_assert(sizeof(Features) / sizeof(Features[0]) == 4, "");
+  static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, "");

   // This function needs to run just once.
   if (__cpu_model.__cpu_vendor)
@@ -844,9 +893,10 @@

   // Find available features.
   getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]);
-  assert((sizeof(Features)/sizeof(Features[0])) == 2);

   __cpu_model.__cpu_features[0] = Features[0];
-  __cpu_features2 = Features[1];
+  __cpu_features2[0] = Features[1];
+  __cpu_features2[1] = Features[2];
+  __cpu_features2[2] = Features[3];

   if (Vendor == SIG_INTEL) {
     // Get CPU type.
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -237,6 +237,7 @@
 // listed here before, which means it doesn't support -march, -mtune and so on.
 // FIXME: Remove OnlyForCPUDispatchSpecific after all CPUs here support both
 // cpu_dispatch/specific() feature and -march, -mtune, and so on.
+// clang-format off
 constexpr ProcInfo Processors[] = {
   // Empty processor. Include X87 and CMPXCHG8 for backwards compatibility.
   { {""}, CK_None, ~0U, FeatureX87 | FeatureCMPXCHG8B, '\0', false },
@@ -404,13 +405,14 @@
   { {"znver3"}, CK_ZNVER3, FEATURE_AVX2, FeaturesZNVER3, '\0', false },
   { {"znver4"}, CK_ZNVER4, FEATURE_AVX512VBMI2, FeaturesZNVER4, '\0', false },
   // Generic 64-bit processor.
- { {"x86-64"}, CK_x86_64, ~0U, FeaturesX86_64, '\0', false }, - { {"x86-64-v2"}, CK_x86_64_v2, ~0U, FeaturesX86_64_V2, '\0', false }, - { {"x86-64-v3"}, CK_x86_64_v3, ~0U, FeaturesX86_64_V3, '\0', false }, - { {"x86-64-v4"}, CK_x86_64_v4, ~0U, FeaturesX86_64_V4, '\0', false }, + { {"x86-64"}, CK_x86_64, FEATURE_SSE2 , FeaturesX86_64, '\0', false }, + { {"x86-64-v2"}, CK_x86_64_v2, FEATURE_SSE4_2 , FeaturesX86_64_V2, '\0', false }, + { {"x86-64-v3"}, CK_x86_64_v3, FEATURE_AVX2, FeaturesX86_64_V3, '\0', false }, + { {"x86-64-v4"}, CK_x86_64_v4, FEATURE_AVX512VL, FeaturesX86_64_V4, '\0', false }, // Geode processors. { {"geode"}, CK_Geode, ~0U, FeaturesGeode, '\0', false }, }; +// clang-format on constexpr const char *NoTuneList[] = {"x86-64-v2", "x86-64-v3", "x86-64-v4"};