diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -264,6 +264,9 @@ X86 Support ^^^^^^^^^^^ +- Added option ``-m[no-]evex512`` to disable ZMM and 64-bit mask instructions + for AVX512 features. + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5733,6 +5733,8 @@ def mno_cx16 : Flag<["-"], "mno-cx16">, Group; def menqcmd : Flag<["-"], "menqcmd">, Group; def mno_enqcmd : Flag<["-"], "mno-enqcmd">, Group; +def mevex512 : Flag<["-"], "mevex512">, Group; +def mno_evex512 : Flag<["-"], "mno-evex512">, Group; def mf16c : Flag<["-"], "mf16c">, Group; def mno_f16c : Flag<["-"], "mno-f16c">, Group; def mfma : Flag<["-"], "mfma">, Group; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -95,6 +95,7 @@ bool HasLWP = false; bool HasFMA = false; bool HasF16C = false; + bool HasEVEX512 = false; bool HasAVX512CD = false; bool HasAVX512VPOPCNTDQ = false; bool HasAVX512VNNI = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -119,6 +119,7 @@ setFeatureEnabled(Features, F, true); std::vector UpdatedFeaturesVec; + bool HasEVEX512 = false; for (const auto &Feature : FeaturesVec) { // Expand general-regs-only to -x86, -mmx and -sse if (Feature == "+general-regs-only") { @@ -128,8 +129,17 @@ continue; } + // AVX512F will enable EVEX512. + if (!HasEVEX512 && Feature.substr(0, 7) == "+avx512") + HasEVEX512 = true; + // Disable AVX512F will disable EVEX512. 
+ if (HasEVEX512 && (Feature == "-avx512f" || Feature == "-evex512")) + HasEVEX512 = false; + UpdatedFeaturesVec.push_back(Feature); } + if (HasEVEX512) + UpdatedFeaturesVec.push_back("+evex512"); if (!TargetInfo::initFeatureMap(Features, Diags, CPU, UpdatedFeaturesVec)) return false; @@ -228,6 +238,8 @@ HasF16C = true; } else if (Feature == "+gfni") { HasGFNI = true; + } else if (Feature == "+evex512") { + HasEVEX512 = true; } else if (Feature == "+avx512cd") { HasAVX512CD = true; } else if (Feature == "+avx512vpopcntdq") { @@ -731,6 +743,8 @@ if (HasGFNI) Builder.defineMacro("__GFNI__"); + if (HasEVEX512) + Builder.defineMacro("__EVEX512__"); if (HasAVX512CD) Builder.defineMacro("__AVX512CD__"); if (HasAVX512VPOPCNTDQ) @@ -986,6 +1000,7 @@ .Case("crc32", true) .Case("cx16", true) .Case("enqcmd", true) + .Case("evex512", true) .Case("f16c", true) .Case("fma", true) .Case("fma4", true) @@ -1093,6 +1108,7 @@ .Case("cx8", HasCX8) .Case("cx16", HasCX16) .Case("enqcmd", HasENQCMD) + .Case("evex512", HasEVEX512) .Case("f16c", HasF16C) .Case("fma", HasFMA) .Case("fma4", XOPLevel >= FMA4) @@ -1533,8 +1549,9 @@ return Size <= 64; case 'z': // XMM0/YMM/ZMM0 - if (hasFeatureEnabled(FeatureMap, "avx512f")) - // ZMM0 can be used if target supports AVX512F. + if (hasFeatureEnabled(FeatureMap, "avx512f") && + hasFeatureEnabled(FeatureMap, "evex512")) + // ZMM0 can be used if target supports AVX512F and EVEX512 is set. return Size <= 512U; else if (hasFeatureEnabled(FeatureMap, "avx")) // YMM0 can be used if target supports AVX. @@ -1553,8 +1570,10 @@ break; case 'v': case 'x': - if (hasFeatureEnabled(FeatureMap, "avx512f")) - // 512-bit zmm registers can be used if target supports AVX512F. + if (hasFeatureEnabled(FeatureMap, "avx512f") && + hasFeatureEnabled(FeatureMap, "evex512")) + // 512-bit zmm registers can be used if target supports AVX512F and + // EVEX512 is set. 
return Size <= 512U; else if (hasFeatureEnabled(FeatureMap, "avx")) // 256-bit ymm registers can be used if target supports AVX. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5471,8 +5471,11 @@ // can move this up to the beginning of the function. checkTargetFeatures(E, FD); - if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID)) + if (unsigned VectorWidth = + getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID)) { + checkTargetVectorWidth(E, FD, VectorWidth); LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth); + } // See if we have a target specific intrinsic. StringRef Name = getContext().BuiltinInfo.getName(BuiltinID); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4067,6 +4067,8 @@ void checkTargetFeatures(const CallExpr *E, const FunctionDecl *TargetDecl); void checkTargetFeatures(SourceLocation Loc, const FunctionDecl *TargetDecl); + void checkTargetVectorWidth(const CallExpr *E, const FunctionDecl *TargetDecl, + unsigned VectorWidth); llvm::CallInst *EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name = ""); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2573,6 +2573,20 @@ return checkTargetFeatures(E->getBeginLoc(), TargetDecl); } +// Emits an error if the builtin's vector width >= 512 and evex512 feature is +// not set. 
+void CodeGenFunction::checkTargetVectorWidth(const CallExpr *E, + const FunctionDecl *TargetDecl, + unsigned VectorWidth) { + if (!getTarget().getTriple().isX86() || VectorWidth < 512) + return; + llvm::StringMap<bool> FeatureMap; + CGM.getContext().getFunctionFeatureMap(FeatureMap, TargetDecl); + if (FeatureMap.lookup("avx512f") && !FeatureMap.lookup("evex512")) + CGM.getDiags().Report(E->getBeginLoc(), diag::err_builtin_needs_feature) + << TargetDecl->getDeclName() << "evex512"; +} + // Emits an error if we don't have a valid set of target features for the // called function. void CodeGenFunction::checkTargetFeatures(SourceLocation Loc, diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -1508,6 +1508,24 @@ return false; } +static bool checkAVX512ParamFeature(DiagnosticsEngine &Diag, + SourceLocation CallLoc, + const llvm::StringMap<bool> &CallerMap, + const llvm::StringMap<bool> &CalleeMap, + QualType Ty, bool IsArgument) { + bool Caller256 = CallerMap.lookup("avx512f") && !CallerMap.lookup("evex512"); + bool Callee256 = CalleeMap.lookup("avx512f") && !CalleeMap.lookup("evex512"); + + // Forbid passing or returning 512-bit or larger vectors when either side + // has ZMM instructions disabled. 
+ if (Caller256 || Callee256) + return Diag.Report(CallLoc, diag::err_avx_calling_convention) + << IsArgument << Ty << "evex512"; + + return checkAVXParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty, + "avx512f", IsArgument); +} + static bool checkAVXParam(DiagnosticsEngine &Diag, ASTContext &Ctx, SourceLocation CallLoc, const llvm::StringMap &CallerMap, @@ -1515,8 +1533,8 @@ bool IsArgument) { uint64_t Size = Ctx.getTypeSize(Ty); if (Size > 256) - return checkAVXParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty, - "avx512f", IsArgument); + return checkAVX512ParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty, + IsArgument); if (Size > 128) return checkAVXParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty, "avx", diff --git a/clang/test/CodeGen/X86/avx512-error.c b/clang/test/CodeGen/X86/avx512-error.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512-error.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512dq -target-feature -evex512 -emit-llvm -verify + +#include + +__m512d test_mm512_sqrt_pd(__m512d a) +{ + // CHECK-LABEL: @test_mm512_sqrt_pd + return __builtin_ia32_sqrtpd512(a, _MM_FROUND_CUR_DIRECTION); // expected-error {{'__builtin_ia32_sqrtpd512' needs target feature evex512}} +} diff --git a/clang/test/CodeGen/attr-cpuspecific.c b/clang/test/CodeGen/attr-cpuspecific.c --- a/clang/test/CodeGen/attr-cpuspecific.c +++ b/clang/test/CodeGen/attr-cpuspecific.c @@ -353,7 +353,7 @@ // CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-SAME: "tune-cpu"="ivybridge" -// CHECK: attributes #[[K]] = 
{{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" +// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-SAME: "tune-cpu"="knl" // CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87" // CHECK-SAME: "tune-cpu"="atom" diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -69,4 +69,4 @@ // CHECK: "target-cpu"="x86-64-v3" // CHECK-SAME: "target-features"="+avx,+avx2,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" // CHECK: "target-cpu"="x86-64-v4" -// CHECK-SAME: "target-features"="+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" +// CHECK-SAME: "target-features"="+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" diff --git a/clang/test/CodeGen/regcall2.c b/clang/test/CodeGen/regcall2.c --- a/clang/test/CodeGen/regcall2.c +++ b/clang/test/CodeGen/regcall2.c @@ -21,8 +21,8 @@ // FIXME: Do we need to change for Windows? 
// Win: define dso_local x86_regcallcc void @__regcall3__foo(ptr noalias sret(%struct.__sVector) align 64 %agg.result, i32 noundef %a) #0 // Win: define dso_local x86_regcallcc double @__regcall3__bar(ptr noundef %a) #0 -// Win: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" } +// Win: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" } // Lin: define dso_local x86_regcallcc %struct.__sVector @__regcall3__foo(i32 noundef %a) #0 // Lin: define dso_local x86_regcallcc double @__regcall3__bar([4 x <8 x double>] %a.coerce0, [4 x <16 x float>] %a.coerce1) #0 -// Lin: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="512" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" } +// Lin: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="512" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" } diff --git a/clang/test/CodeGen/target-avx-abi-diag.c b/clang/test/CodeGen/target-avx-abi-diag.c --- a/clang/test/CodeGen/target-avx-abi-diag.c +++ b/clang/test/CodeGen/target-avx-abi-diag.c @@ -1,6 +1,8 @@ // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -verify=no256,no512 -o - -S // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx 
-verify=no512 -o - -S // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -verify=both -o - -S +// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature +evex512 -verify=both -o - -S +// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -DAVX512_256 -verify=avx512-256 -o - -S // REQUIRES: x86-registered-target // both-no-diagnostics @@ -16,6 +18,7 @@ void variadic(int i, ...); __attribute__((target("avx512f"))) void variadic_err(int i, ...); +#ifndef AVX512_256 // If neither side has an attribute, warn. void call_warn(void) { avx256Type t1; @@ -27,15 +30,18 @@ variadic(1, t1); // no256-warning {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}} variadic(3, t2); // no512-warning {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}} } +#endif // If only 1 side has an attribute, error. 
void call_errors(void) { avx256Type t1; takesAvx256(t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}} avx512fType t2; + // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}} takesAvx512(t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}} variadic_err(1, t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}} + // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}} variadic_err(3, t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}} } diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -369,6 +369,11 @@ // AVXVNNIINT16: "-target-feature" "+avxvnniint16" // NO-AVXVNNIINT16: "-target-feature" "-avxvnniint16" +// RUN: %clang --target=i386 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EVEX512 %s +// RUN: %clang --target=i386 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-EVEX512 %s +// EVEX512: "-target-feature" "+evex512" +// NO-EVEX512: "-target-feature" "-evex512" + // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s // CRC32: "-target-feature" "+crc32" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ 
-64,6 +64,7 @@ // AVX512F: #define __AVX2__ 1 // AVX512F: #define __AVX512F__ 1 // AVX512F: #define __AVX__ 1 +// AVX512F: #define __EVEX512__ 1 // AVX512F: #define __SSE2_MATH__ 1 // AVX512F: #define __SSE2__ 1 // AVX512F: #define __SSE3__ 1 @@ -79,6 +80,7 @@ // AVX512CD: #define __AVX512CD__ 1 // AVX512CD: #define __AVX512F__ 1 // AVX512CD: #define __AVX__ 1 +// AVX512CD: #define __EVEX512__ 1 // AVX512CD: #define __SSE2_MATH__ 1 // AVX512CD: #define __SSE2__ 1 // AVX512CD: #define __SSE3__ 1 @@ -94,6 +96,7 @@ // AVX512ER: #define __AVX512ER__ 1 // AVX512ER: #define __AVX512F__ 1 // AVX512ER: #define __AVX__ 1 +// AVX512ER: #define __EVEX512__ 1 // AVX512ER: #define __SSE2_MATH__ 1 // AVX512ER: #define __SSE2__ 1 // AVX512ER: #define __SSE3__ 1 @@ -109,6 +112,7 @@ // AVX512PF: #define __AVX512F__ 1 // AVX512PF: #define __AVX512PF__ 1 // AVX512PF: #define __AVX__ 1 +// AVX512PF: #define __EVEX512__ 1 // AVX512PF: #define __SSE2_MATH__ 1 // AVX512PF: #define __SSE2__ 1 // AVX512PF: #define __SSE3__ 1 @@ -124,6 +128,7 @@ // AVX512DQ: #define __AVX512DQ__ 1 // AVX512DQ: #define __AVX512F__ 1 // AVX512DQ: #define __AVX__ 1 +// AVX512DQ: #define __EVEX512__ 1 // AVX512DQ: #define __SSE2_MATH__ 1 // AVX512DQ: #define __SSE2__ 1 // AVX512DQ: #define __SSE3__ 1 @@ -139,6 +144,7 @@ // AVX512BW: #define __AVX512BW__ 1 // AVX512BW: #define __AVX512F__ 1 // AVX512BW: #define __AVX__ 1 +// AVX512BW: #define __EVEX512__ 1 // AVX512BW: #define __SSE2_MATH__ 1 // AVX512BW: #define __SSE2__ 1 // AVX512BW: #define __SSE3__ 1 @@ -154,6 +160,7 @@ // AVX512VL: #define __AVX512F__ 1 // AVX512VL: #define __AVX512VL__ 1 // AVX512VL: #define __AVX__ 1 +// AVX512VL: #define __EVEX512__ 1 // AVX512VL: #define __SSE2_MATH__ 1 // AVX512VL: #define __SSE2__ 1 // AVX512VL: #define __SSE3__ 1 @@ -168,6 +175,7 @@ // AVX512F2: #define __AVX2__ 1 // AVX512F2-NOT: #define __AVX512F__ 1 // AVX512F2-NOT: #define __AVX512PF__ 1 +// AVX512F2-NOT: #define __EVEX512__ 1 // AVX512F2: #define __AVX__ 1 // 
AVX512F2: #define __SSE2_MATH__ 1 // AVX512F2: #define __SSE2__ 1 @@ -184,6 +192,7 @@ // AVX512IFMA: #define __AVX512F__ 1 // AVX512IFMA: #define __AVX512IFMA__ 1 // AVX512IFMA: #define __AVX__ 1 +// AVX512IFMA: #define __EVEX512__ 1 // AVX512IFMA: #define __SSE2_MATH__ 1 // AVX512IFMA: #define __SSE2__ 1 // AVX512IFMA: #define __SSE3__ 1 @@ -200,6 +209,7 @@ // AVX512VBMI: #define __AVX512F__ 1 // AVX512VBMI: #define __AVX512VBMI__ 1 // AVX512VBMI: #define __AVX__ 1 +// AVX512VBMI: #define __EVEX512__ 1 // AVX512VBMI: #define __SSE2_MATH__ 1 // AVX512VBMI: #define __SSE2__ 1 // AVX512VBMI: #define __SSE3__ 1 @@ -216,6 +226,7 @@ // AVX512BITALG: #define __AVX512BW__ 1 // AVX512BITALG: #define __AVX512F__ 1 // AVX512BITALG: #define __AVX__ 1 +// AVX512BITALG: #define __EVEX512__ 1 // AVX512BITALG: #define __SSE2_MATH__ 1 // AVX512BITALG: #define __SSE2__ 1 // AVX512BITALG: #define __SSE3__ 1 @@ -230,6 +241,7 @@ // AVX512VBMINOAVX512BW-NOT: #define __AVX512BW__ 1 // AVX512VBMINOAVX512BW-NOT: #define __AVX512VBMI__ 1 +// AVX512VBMINOAVX512BW: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vbmi2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512VBMI2 %s @@ -238,6 +250,7 @@ // AVX512VBMI2: #define __AVX512F__ 1 // AVX512VBMI2: #define __AVX512VBMI2__ 1 // AVX512VBMI2: #define __AVX__ 1 +// AVX512VBMI2: #define __EVEX512__ 1 // AVX512VBMI2: #define __SSE2_MATH__ 1 // AVX512VBMI2: #define __SSE2__ 1 // AVX512VBMI2: #define __SSE3__ 1 @@ -251,11 +264,13 @@ // AVX512VBMI2NOAVX512BW-NOT: #define __AVX512BW__ 1 // AVX512VBMI2NOAVX512BW-NOT: #define __AVX512VBMI2__ 1 +// AVX512VBMI2NOAVX512BW: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bitalg -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BITALGNOAVX512BW %s // AVX512BITALGNOAVX512BW-NOT: #define __AVX512BITALG__ 1 // AVX512BITALGNOAVX512BW-NOT: #define __AVX512BW__ 1 +// 
AVX512BITALGNOAVX512BW: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -msse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE42POPCNT %s @@ -467,24 +482,29 @@ // AVX512BF16: #define __AVX512BF16__ 1 // AVX512BF16: #define __AVX512BW__ 1 // AVX512BF16-NOT: #define __AVX512VL__ 1 +// AVX512BF16: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bf16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BF16_NOAVX512BW %s // AVX512BF16_NOAVX512BW-NOT: #define __AVX512BF16__ 1 +// AVX512BF16_NOAVX512BW: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bf16 -mno-avx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BF16_NOAVX512VL %s // AVX512BF16_NOAVX512VL: #define __AVX512BF16__ 1 +// AVX512BF16_NOAVX512VL: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx512vp2intersect -x c -E -dM -o - %s | FileCheck -check-prefix=VP2INTERSECT %s // VP2INTERSECT: #define __AVX512F__ 1 // VP2INTERSECT: #define __AVX512VP2INTERSECT__ 1 +// VP2INTERSECT: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avx512vp2intersect -x c -E -dM -o - %s | FileCheck -check-prefix=NOVP2INTERSECT %s // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx512vp2intersect -mno-avx512f -x c -E -dM -o - %s | FileCheck -check-prefix=NOVP2INTERSECT %s // NOVP2INTERSECT-NOT: #define __AVX512VP2INTERSECT__ 1 +// NOVP2INTERSECT-NOT: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mkl -x c -E -dM -o - %s | FileCheck -check-prefix=KEYLOCKER %s @@ -591,21 +611,37 @@ // AVX512FP16: #define __AVX512DQ__ 1 // AVX512FP16: #define __AVX512FP16__ 1 // AVX512FP16: #define __AVX512VL__ 1 +// AVX512FP16: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512vl -x c -E -dM -o - %s | 
FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512VL %s // AVX512FP16NOAVX512VL-NOT: #define __AVX512FP16__ 1 // AVX512FP16NOAVX512VL-NOT: #define __AVX512VL__ 1 +// AVX512FP16NOAVX512VL: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512BW %s // AVX512FP16NOAVX512BW-NOT: #define __AVX512BW__ 1 // AVX512FP16NOAVX512BW-NOT: #define __AVX512FP16__ 1 +// AVX512FP16NOAVX512BW: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512DQ %s // AVX512FP16NOAVX512DQ-NOT: #define __AVX512DQ__ 1 // AVX512FP16NOAVX512DQ-NOT: #define __AVX512FP16__ 1 +// AVX512FP16NOAVX512DQ: #define __EVEX512__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s +// NOEVEX512-NOT: #define __AVX512F__ 1 +// NOEVEX512-NOT: #define __EVEX512__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s +// AVX512NOEVEX512: #define __AVX512F__ 1 +// AVX512NOEVEX512-NOT: 
#define __EVEX512__ 1 // RUN: %clang -target x86_64-unknown-linux-gnu -march=atom -mcmpccxadd -x c -E -dM -o - %s | FileCheck -check-prefix=CMPCCXADD %s diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -240,6 +240,7 @@ X86_FEATURE (SM3, "sm3") X86_FEATURE (SM4, "sm4") X86_FEATURE (AVXVNNIINT16, "avxvnniint16") +X86_FEATURE (EVEX512, "evex512") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2030,6 +2030,17 @@ "' does not apply to function return values", V); + unsigned MaxParameterWidth = 0; + auto GetMaxParameterWidth = [&MaxParameterWidth](Type *Ty) { + if (Ty->isVectorTy()) { + if (auto *VT = dyn_cast(Ty)) { + unsigned Size = VT->getPrimitiveSizeInBits().getFixedValue(); + if (Size > MaxParameterWidth) + MaxParameterWidth = Size; + } + } + }; + GetMaxParameterWidth(FT->getReturnType()); verifyParameterAttrs(RetAttrs, FT->getReturnType(), V); // Verify parameter attributes. @@ -2048,6 +2059,7 @@ } verifyParameterAttrs(ArgAttrs, Ty, V); + GetMaxParameterWidth(Ty); if (ArgAttrs.hasAttribute(Attribute::Nest)) { Check(!SawNest, "More than one parameter has attribute nest!", V); @@ -2203,6 +2215,16 @@ CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V); } + // Check EVEX512 feature. 
+ if (MaxParameterWidth >= 512 && Attrs.hasFnAttr("target-features")) { + Triple T(M.getTargetTriple()); + if (T.isX86()) { + StringRef TF = Attrs.getFnAttr("target-features").getValueAsString(); + Check(!TF.contains("+avx512f") || !TF.contains("-evex512"), + "512-bit vector arguments require 'evex512' for AVX512", V); + } + } + checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-prefix", V); checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-entry", V); checkUnsignedBaseTenFuncAttr(Attrs, "warn-stack-size", V); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -285,6 +285,7 @@ SmallVectorImpl &CB) const; PrefixKind emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, + const MCSubtargetInfo &STI, SmallVectorImpl &CB) const; void emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI, @@ -841,7 +842,7 @@ // REX prefix is optional, but if used must be immediately before the opcode // Encoding type for this instruction. return (TSFlags & X86II::EncodingMask) - ? emitVEXOpcodePrefix(MemoryOperand, MI, CB) + ? emitVEXOpcodePrefix(MemoryOperand, MI, STI, CB) : emitOpcodePrefix(MemoryOperand, MI, STI, CB); } @@ -860,6 +861,7 @@ /// \returns the used prefix. 
PrefixKind X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, + const MCSubtargetInfo &STI, SmallVectorImpl &CB) const { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); uint64_t TSFlags = Desc.TSFlags; @@ -919,6 +921,9 @@ Prefix.setL(TSFlags & X86II::VEX_L); Prefix.setL2(TSFlags & X86II::EVEX_L2); + if ((TSFlags & X86II::EVEX_L2) && STI.hasFeature(X86::FeatureAVX512) && + !STI.hasFeature(X86::FeatureEVEX512)) + report_fatal_error("ZMM registers are not supported without EVEX512"); switch (TSFlags & X86II::OpPrefixMask) { case X86II::PD: Prefix.setPP(0x1); // 66 diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -119,6 +119,8 @@ def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", "Support 16-bit floating point conversion instructions", [FeatureAVX]>; +def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true", + "Support ZMM and 64-bit mask instructions">; def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512", "Enable AVX-512 instructions", [FeatureAVX2, FeatureFMA, FeatureF16C]>; @@ -817,6 +819,7 @@ ]; list X86_64V4Features = !listconcat(X86_64V3Features, [ + FeatureEVEX512, FeatureBWI, FeatureCDI, FeatureDQI, @@ -940,6 +943,7 @@ FeatureXSAVES, FeatureCLFLUSHOPT, FeatureAVX512, + FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, @@ -982,6 +986,7 @@ // Cannonlake list CNLAdditionalFeatures = [FeatureAVX512, + FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, @@ -1262,6 +1267,7 @@ FeatureF16C, FeatureFSGSBase, FeatureAVX512, + FeatureEVEX512, FeatureERI, FeatureCDI, FeaturePFI, @@ -1471,6 +1477,7 @@ !listconcat(ZN2Features, ZN3AdditionalFeatures); list ZN4Tuning = ZN3Tuning; list ZN4AdditionalFeatures = [FeatureAVX512, + FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ 
b/llvm/lib/Target/X86/X86InstrInfo.td @@ -903,6 +903,7 @@ def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasEVEX512 : Predicate<"Subtarget->hasEVEX512()">; def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1030,7 +1030,14 @@ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( VirtReg, Order, Hints, MF, VRM, Matrix); - if (RC.getID() != X86::TILERegClassID) + unsigned ID = RC.getID(); + const X86Subtarget &Subtarget = MF.getSubtarget(); + if ((ID == X86::VK64RegClassID || ID == X86::VK64WMRegClassID) && + Subtarget.hasAVX512() && !Subtarget.hasEVEX512()) + report_fatal_error( + "64-bit mask registers are not supported without EVEX512"); + + if (ID != X86::TILERegClassID) return BaseImplRetVal; ShapeT VirtShape = getTileShape(VirtReg, const_cast(VRM), MRI); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -264,7 +264,8 @@ // If there are no 512-bit vectors and we prefer not to use 512-bit registers, // disable them in the legalizer. 
bool useAVX512Regs() const { - return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256); + return hasAVX512() && hasEVEX512() && + (canExtendTo512DQ() || RequiredVectorWidth > 256); } bool useLight256BitInstructions() const { diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -268,6 +268,17 @@ if (!FS.empty()) FullFS = (Twine(FullFS) + "," + FS).str(); + // Attach EVEX512 feature when we have AVX512 features and EVEX512 is not set. + size_t posNoEVEX512 = FS.rfind("-evex512"); + size_t posEVEX512 = FS.rfind("+evex512"); + size_t posAVX512 = FS.rfind("+avx512"); + + if (posAVX512 != StringRef::npos) { + if ((posNoEVEX512 == StringRef::npos && posEVEX512 == StringRef::npos) || + (posNoEVEX512 != StringRef::npos && posAVX512 > posNoEVEX512)) + FullFS += ",+evex512"; + } + // Parse features string and set the CPU. ParseSubtargetFeatures(CPU, TuneCPU, FullFS); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -72,7 +72,7 @@ constexpr FeatureBitset FeaturesX86_64_V3 = FeaturesX86_64_V2 | FeatureAVX2 | FeatureBMI | FeatureBMI2 | FeatureF16C | FeatureFMA | FeatureLZCNT | FeatureMOVBE | FeatureXSAVE; -constexpr FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 | +constexpr FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 | FeatureEVEX512 | FeatureAVX512BW | FeatureAVX512CD | FeatureAVX512DQ | FeatureAVX512VL; @@ -96,8 +96,8 @@ // Intel Knights Landing and Knights Mill // Knights Landing has feature parity with Broadwell. 
constexpr FeatureBitset FeaturesKNL = - FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureAVX512CD | - FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1; + FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureEVEX512 | + FeatureAVX512CD | FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1; constexpr FeatureBitset FeaturesKNM = FeaturesKNL | FeatureAVX512VPOPCNTDQ; // Intel Skylake processors. @@ -107,9 +107,9 @@ // SkylakeServer inherits all SkylakeClient features except SGX. // FIXME: That doesn't match gcc. constexpr FeatureBitset FeaturesSkylakeServer = - (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureAVX512CD | - FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureCLWB | - FeaturePKU; + (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureEVEX512 | + FeatureAVX512CD | FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | + FeatureCLWB | FeaturePKU; constexpr FeatureBitset FeaturesCascadeLake = FeaturesSkylakeServer | FeatureAVX512VNNI; constexpr FeatureBitset FeaturesCooperLake = @@ -117,9 +117,9 @@ // Intel 10nm processors. 
constexpr FeatureBitset FeaturesCannonlake = - FeaturesSkylakeClient | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ | - FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI | - FeaturePKU | FeatureSHA; + FeaturesSkylakeClient | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD | + FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | + FeatureAVX512VBMI | FeaturePKU | FeatureSHA; constexpr FeatureBitset FeaturesICLClient = FeaturesCannonlake | FeatureAVX512BITALG | FeatureAVX512VBMI2 | FeatureAVX512VNNI | FeatureAVX512VPOPCNTDQ | FeatureGFNI | FeatureRDPID | @@ -230,11 +230,11 @@ FeatureINVPCID | FeaturePKU | FeatureVAES | FeatureVPCLMULQDQ; static constexpr FeatureBitset FeaturesZNVER4 = - FeaturesZNVER3 | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ | - FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI | - FeatureAVX512VBMI2 | FeatureAVX512VNNI | FeatureAVX512BITALG | - FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 | FeatureGFNI | - FeatureSHSTK; + FeaturesZNVER3 | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD | + FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | + FeatureAVX512VBMI | FeatureAVX512VBMI2 | FeatureAVX512VNNI | + FeatureAVX512BITALG | FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 | + FeatureGFNI | FeatureSHSTK; // D151696 tranplanted Mangling and OnlyForCPUDispatchSpecific from // X86TargetParser.def to here. 
They are assigned by following ways: @@ -542,6 +542,7 @@ constexpr FeatureBitset ImpliedFeaturesSSE4_2 = FeatureSSE4_1; constexpr FeatureBitset ImpliedFeaturesAVX = FeatureSSE4_2; constexpr FeatureBitset ImpliedFeaturesAVX2 = FeatureAVX; +constexpr FeatureBitset ImpliedFeaturesEVEX512 = {}; constexpr FeatureBitset ImpliedFeaturesAVX512F = FeatureAVX2 | FeatureF16C | FeatureFMA; diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll --- a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s ; 256-bit diff --git a/llvm/test/CodeGen/X86/avx512vl-arith.ll b/llvm/test/CodeGen/X86/avx512vl-arith.ll --- a/llvm/test/CodeGen/X86/avx512vl-arith.ll +++ b/llvm/test/CodeGen/X86/avx512vl-arith.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl,-evex512 --show-mc-encoding| FileCheck %s ; 256-bit