This is an archive of the discontinued LLVM Phabricator instance.

[i386] Fix bug that gets __m128/__m256/__m512 with wrong alignment for variadic functions.
Needs ReviewPublic

Authored by LiuChen3 on Apr 20 2020, 7:54 PM.

Download Raw Diff

Details

Reviewers

craig.topper
LuoYuanke
hjl.tools
rnk
rjmccall
wxiao3
jyknight
RKSimon

Summary

Currently clang aligns to 16 bytes when passing __m128/__m256/__m512 vector types.
However, when calculating va_arg, the argument is always treated as 4-byte aligned, including
struct, union and vector types. For struct/union, there is no problem because they are aligned
to 4 bytes when passed. For __m128/__m256/__m512 vector types, va_arg gets the wrong result.

This patch will get va_arg according to the rules below:

When the target doesn't support avx and avx512: get __m128/__m256/__m512 from 16 bytes aligned stack.
When the target supports avx: get __m256/__m512 from 32 bytes aligned stack.
When the target supports avx512: get __m512 from 64 bytes aligned stack.

Diff Detail

Event Timeline

LiuChen3 created this revision.Apr 20 2020, 7:54 PM

Notice:
The current behavior of clang is inconsistent with the i386 ABI. The i386 ABI says:

If parameters of type __m256 are required to be passed on the stack, the stack pointer must be aligned on a 0 mod 32 byte boundary at the time of the call.
If parameters of type __m512 are required to be passed on the stack, the stack pointer must be aligned on a 0 mod 64 byte boundary at the time of the call.

I think it's better to modify clang's calling convention.

CCing cfe-commits

I uploaded a new patch, D78564, as another solution, but it modifies the current clang calling convention.

RKSimon resigned from this revision.Jun 18 2020, 2:11 AM

Revision Contents

Path

Size

clang/

lib/

Basic/

Targets/

X86.h

10 lines

CodeGen/

TargetInfo.cpp

74 lines

test/

CodeGen/

x86_32-align-linux-avx2.c

46 lines

x86_32-align-linux-avx512f.c

18 lines

x86_32-align-linux.c

46 lines

Diff 258888

clang/lib/Basic/Targets/X86.h

Show First 20 Lines • Show All 296 Lines • ▼ Show 20 Lines	public:
bool isValidFeatureName(StringRef Name) const override;		bool isValidFeatureName(StringRef Name) const override;

bool hasFeature(StringRef Feature) const override;		bool hasFeature(StringRef Feature) const override;

bool handleTargetFeatures(std::vector<std::string> &Features,		bool handleTargetFeatures(std::vector<std::string> &Features,
DiagnosticsEngine &Diags) override;		DiagnosticsEngine &Diags) override;

StringRef getABI() const override {		StringRef getABI() const override {
if (getTriple().getArch() == llvm::Triple::x86_64 && SSELevel >= AVX512F)		llvm::Triple::ArchType Arch = getTriple().getArch();
		if ((Arch == llvm::Triple::x86_64 \|\| Arch == llvm::Triple::x86) &&
		SSELevel >= AVX512F)
return "avx512";		return "avx512";
if (getTriple().getArch() == llvm::Triple::x86_64 && SSELevel >= AVX)		if ((Arch == llvm::Triple::x86_64 \|\| Arch == llvm::Triple::x86) &&
		SSELevel >= AVX)
return "avx";		return "avx";
if (getTriple().getArch() == llvm::Triple::x86 &&		if (Arch == llvm::Triple::x86 && MMX3DNowLevel == NoMMX3DNow)
MMX3DNowLevel == NoMMX3DNow)
return "no-mmx";		return "no-mmx";
return "";		return "";
}		}

bool isValidCPUName(StringRef Name) const override {		bool isValidCPUName(StringRef Name) const override {
return checkCPUKind(getCPUKind(Name));		return checkCPUKind(getCPUKind(Name));
}		}

▲ Show 20 Lines • Show All 584 Lines • Show Last 20 Lines

clang/lib/CodeGen/TargetInfo.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,017 Lines • ▼ Show 20 Lines	static ABIArgInfo getDirectX86Hva(llvm::Type* T = nullptr) {
AI.setCanBeFlattened(false);		AI.setCanBeFlattened(false);
return AI;		return AI;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// X86-32 ABI Implementation		// X86-32 ABI Implementation
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

		/// The AVX ABI level for X86 targets.
		enum class X86AVXABILevel { None, AVX, AVX512 };

/// Similar to llvm::CCState, but for Clang.		/// Similar to llvm::CCState, but for Clang.
struct CCState {		struct CCState {
CCState(CGFunctionInfo &FI)		CCState(CGFunctionInfo &FI)
: IsPreassigned(FI.arg_size()), CC(FI.getCallingConvention()) {}		: IsPreassigned(FI.arg_size()), CC(FI.getCallingConvention()) {}

llvm::SmallBitVector IsPreassigned;		llvm::SmallBitVector IsPreassigned;
unsigned CC = CallingConv::CC_C;		unsigned CC = CallingConv::CC_C;
unsigned FreeRegs = 0;		unsigned FreeRegs = 0;
Show All 14 Lines	class X86_32ABIInfo : public SwiftABIInfo {

static const unsigned MinABIStackAlignInBytes = 4;		static const unsigned MinABIStackAlignInBytes = 4;

bool IsDarwinVectorABI;		bool IsDarwinVectorABI;
bool IsRetSmallStructInRegABI;		bool IsRetSmallStructInRegABI;
bool IsWin32StructABI;		bool IsWin32StructABI;
bool IsSoftFloatABI;		bool IsSoftFloatABI;
bool IsMCUABI;		bool IsMCUABI;
		bool IsLinuxABI;
unsigned DefaultNumRegisterParameters;		unsigned DefaultNumRegisterParameters;
		X86AVXABILevel AVXLevel;

static bool isRegisterSize(unsigned Size) {		static bool isRegisterSize(unsigned Size) {
return (Size == 8 \|\| Size == 16 \|\| Size == 32 \|\| Size == 64);		return (Size == 8 \|\| Size == 16 \|\| Size == 32 \|\| Size == 64);
}		}

bool isHomogeneousAggregateBaseType(QualType Ty) const override {		bool isHomogeneousAggregateBaseType(QualType Ty) const override {
// FIXME: Assumes vectorcall is in use.		// FIXME: Assumes vectorcall is in use.
return isX86VectorTypeForVectorCall(getContext(), Ty);		return isX86VectorTypeForVectorCall(getContext(), Ty);
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
public:		public:

void computeInfo(CGFunctionInfo &FI) const override;		void computeInfo(CGFunctionInfo &FI) const override;
Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,		Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
QualType Ty) const override;		QualType Ty) const override;

X86_32ABIInfo(CodeGen::CodeGenTypes &CGT, bool DarwinVectorABI,		X86_32ABIInfo(CodeGen::CodeGenTypes &CGT, bool DarwinVectorABI,
bool RetSmallStructInRegABI, bool Win32StructABI,		bool RetSmallStructInRegABI, bool Win32StructABI,
unsigned NumRegisterParameters, bool SoftFloatABI)		unsigned NumRegisterParameters, bool SoftFloatABI,
		X86AVXABILevel AVXLevel)
: SwiftABIInfo(CGT), IsDarwinVectorABI(DarwinVectorABI),		: SwiftABIInfo(CGT), IsDarwinVectorABI(DarwinVectorABI),
IsRetSmallStructInRegABI(RetSmallStructInRegABI),		IsRetSmallStructInRegABI(RetSmallStructInRegABI),
IsWin32StructABI(Win32StructABI),		IsWin32StructABI(Win32StructABI), IsSoftFloatABI(SoftFloatABI),
IsSoftFloatABI(SoftFloatABI),
IsMCUABI(CGT.getTarget().getTriple().isOSIAMCU()),		IsMCUABI(CGT.getTarget().getTriple().isOSIAMCU()),
DefaultNumRegisterParameters(NumRegisterParameters) {}		IsLinuxABI(CGT.getTarget().getTriple().isOSLinux()),
		DefaultNumRegisterParameters(NumRegisterParameters),
		AVXLevel(AVXLevel) {}

bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,		bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,
bool asReturnValue) const override {		bool asReturnValue) const override {
// LLVM's x86-32 lowering currently only assigns up to three		// LLVM's x86-32 lowering currently only assigns up to three
// integer registers and three fp registers. Oddly, it'll use up to		// integer registers and three fp registers. Oddly, it'll use up to
// four vector registers for vectors, but those can overlap with the		// four vector registers for vectors, but those can overlap with the
// scalar registers.		// scalar registers.
return occupiesMoreThan(CGT, scalars, /total/ 3);		return occupiesMoreThan(CGT, scalars, /total/ 3);
}		}

bool isSwiftErrorInRegister() const override {		bool isSwiftErrorInRegister() const override {
// x86-32 lowering does not support passing swifterror in a register.		// x86-32 lowering does not support passing swifterror in a register.
return false;		return false;
}		}
};		};

class X86_32TargetCodeGenInfo : public TargetCodeGenInfo {		class X86_32TargetCodeGenInfo : public TargetCodeGenInfo {
public:		public:
X86_32TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool DarwinVectorABI,		X86_32TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool DarwinVectorABI,
bool RetSmallStructInRegABI, bool Win32StructABI,		bool RetSmallStructInRegABI, bool Win32StructABI,
unsigned NumRegisterParameters, bool SoftFloatABI)		unsigned NumRegisterParameters, bool SoftFloatABI,
		X86AVXABILevel AVXLevel)
: TargetCodeGenInfo(new X86_32ABIInfo(		: TargetCodeGenInfo(new X86_32ABIInfo(
CGT, DarwinVectorABI, RetSmallStructInRegABI, Win32StructABI,		CGT, DarwinVectorABI, RetSmallStructInRegABI, Win32StructABI,
NumRegisterParameters, SoftFloatABI)) {}		NumRegisterParameters, SoftFloatABI, AVXLevel)) {}

static bool isStructReturnInRegABI(		static bool isStructReturnInRegABI(
const llvm::Triple &Triple, const CodeGenOptions &Opts);		const llvm::Triple &Triple, const CodeGenOptions &Opts);

void setTargetAttributes(const Decl D, llvm::GlobalValue GV,		void setTargetAttributes(const Decl D, llvm::GlobalValue GV,
CodeGen::CodeGenModule &CGM) const override;		CodeGen::CodeGenModule &CGM) const override;

int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override {		int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override {
▲ Show 20 Lines • Show All 379 Lines • ▼ Show 20 Lines

unsigned X86_32ABIInfo::getTypeStackAlignInBytes(QualType Ty,		unsigned X86_32ABIInfo::getTypeStackAlignInBytes(QualType Ty,
unsigned Align) const {		unsigned Align) const {
// Otherwise, if the alignment is less than or equal to the minimum ABI		// Otherwise, if the alignment is less than or equal to the minimum ABI
// alignment, just use the default; the backend will handle this.		// alignment, just use the default; the backend will handle this.
if (Align <= MinABIStackAlignInBytes)		if (Align <= MinABIStackAlignInBytes)
return 0; // Use default alignment.		return 0; // Use default alignment.

		if (IsLinuxABI) {
		// Exclude other System V OS (e.g Darwin, PS4 and FreeBSD) since we don't
		// want to spend any effort dealing with the ramifications of ABI breaks.
		// If the target only supports doesn't support avx, return 16.
		// If the target supports avx or avx512, __m256 will align to 32 bytes.
		// __m512 will align to 64 bytes when the target supports avx512, align to
		// 32 bytes when the target supports avx and 16 for the other.
		if (Ty->getAs<VectorType>()) {
		int TypeSize = getContext().getTypeSize(Ty);
		if (TypeSize == 128)
		return Align;
		else if (TypeSize == 256)
		return (AVXLevel == X86AVXABILevel::AVX \|\|
		AVXLevel == X86AVXABILevel::AVX512)
		? Align
		: 16;
		else
		return AVXLevel == X86AVXABILevel::AVX512
		? Align
		: AVXLevel == X86AVXABILevel::AVX ? 32 : 16;
		} else
		return MinABIStackAlignInBytes;
		}
// On non-Darwin, the stack type alignment is always 4.		// On non-Darwin, the stack type alignment is always 4.
if (!IsDarwinVectorABI) {		if (!IsDarwinVectorABI) {
// Set explicit alignment, since we may need to realign the top.		// Set explicit alignment, since we may need to realign the top.
return MinABIStackAlignInBytes;		return MinABIStackAlignInBytes;
}		}

// Otherwise, if the type contains an SSE vector type, the alignment is 16.		// Otherwise, if the type contains an SSE vector type, the alignment is 16.
if (Align >= 16 && (isSSEVectorType(getContext(), Ty) \|\|		if (Align >= 16 && (isSSEVectorType(getContext(), Ty) \|\|
▲ Show 20 Lines • Show All 532 Lines • ▼ Show 20 Lines
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// X86-64 ABI Implementation		// X86-64 ABI Implementation
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//


namespace {		namespace {
/// The AVX ABI level for X86 targets.
enum class X86AVXABILevel {
None,
AVX,
AVX512
};

/// \p returns the size in bits of the largest (native) vector for \p AVXLevel.		/// \p returns the size in bits of the largest (native) vector for \p AVXLevel.
static unsigned getNativeVectorSizeForAVXABI(X86AVXABILevel AVXLevel) {		static unsigned getNativeVectorSizeForAVXABI(X86AVXABILevel AVXLevel) {
switch (AVXLevel) {		switch (AVXLevel) {
case X86AVXABILevel::AVX512:		case X86AVXABILevel::AVX512:
return 512;		return 512;
case X86AVXABILevel::AVX:		case X86AVXABILevel::AVX:
return 256;		return 256;
▲ Show 20 Lines • Show All 324 Lines • ▼ Show 20 Lines	static std::string qualifyWindowsLibrary(llvm::StringRef Lib) {
if (!Lib.endswith_lower(".lib") && !Lib.endswith_lower(".a"))		if (!Lib.endswith_lower(".lib") && !Lib.endswith_lower(".a"))
ArgStr += ".lib";		ArgStr += ".lib";
ArgStr += Quote ? "\"" : "";		ArgStr += Quote ? "\"" : "";
return ArgStr;		return ArgStr;
}		}

class WinX86_32TargetCodeGenInfo : public X86_32TargetCodeGenInfo {		class WinX86_32TargetCodeGenInfo : public X86_32TargetCodeGenInfo {
public:		public:
WinX86_32TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT,		WinX86_32TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, bool DarwinVectorABI,
bool DarwinVectorABI, bool RetSmallStructInRegABI, bool Win32StructABI,		bool RetSmallStructInRegABI, bool Win32StructABI,
unsigned NumRegisterParameters)		unsigned NumRegisterParameters)
: X86_32TargetCodeGenInfo(CGT, DarwinVectorABI, RetSmallStructInRegABI,		: X86_32TargetCodeGenInfo(CGT, DarwinVectorABI, RetSmallStructInRegABI,
Win32StructABI, NumRegisterParameters, false) {}		Win32StructABI, NumRegisterParameters, false,
		X86AVXABILevel::None) {}

void setTargetAttributes(const Decl D, llvm::GlobalValue GV,		void setTargetAttributes(const Decl D, llvm::GlobalValue GV,
CodeGen::CodeGenModule &CGM) const override;		CodeGen::CodeGenModule &CGM) const override;

void getDependentLibraryOption(llvm::StringRef Lib,		void getDependentLibraryOption(llvm::StringRef Lib,
llvm::SmallString<24> &Opt) const override {		llvm::SmallString<24> &Opt) const override {
Opt = "/DEFAULTLIB:";		Opt = "/DEFAULTLIB:";
Opt += qualifyWindowsLibrary(Lib);		Opt += qualifyWindowsLibrary(Lib);
▲ Show 20 Lines • Show All 7,874 Lines • ▼ Show 20 Lines	case llvm::Triple::tcele:
return SetCGInfo(new TCETargetCodeGenInfo(Types));		return SetCGInfo(new TCETargetCodeGenInfo(Types));

case llvm::Triple::x86: {		case llvm::Triple::x86: {
bool IsDarwinVectorABI = Triple.isOSDarwin();		bool IsDarwinVectorABI = Triple.isOSDarwin();
bool RetSmallStructInRegABI =		bool RetSmallStructInRegABI =
X86_32TargetCodeGenInfo::isStructReturnInRegABI(Triple, CodeGenOpts);		X86_32TargetCodeGenInfo::isStructReturnInRegABI(Triple, CodeGenOpts);
bool IsWin32FloatStructABI = Triple.isOSWindows() && !Triple.isOSCygMing();		bool IsWin32FloatStructABI = Triple.isOSWindows() && !Triple.isOSCygMing();

		StringRef ABI = getTarget().getABI();
		X86AVXABILevel AVXLevel =
		(ABI == "avx512"
		? X86AVXABILevel::AVX512
		: ABI == "avx" ? X86AVXABILevel::AVX : X86AVXABILevel::None);

if (Triple.getOS() == llvm::Triple::Win32) {		if (Triple.getOS() == llvm::Triple::Win32) {
return SetCGInfo(new WinX86_32TargetCodeGenInfo(		return SetCGInfo(new WinX86_32TargetCodeGenInfo(
Types, IsDarwinVectorABI, RetSmallStructInRegABI,		Types, IsDarwinVectorABI, RetSmallStructInRegABI,
IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters));		IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters));
} else {		} else {
return SetCGInfo(new X86_32TargetCodeGenInfo(		return SetCGInfo(new X86_32TargetCodeGenInfo(
Types, IsDarwinVectorABI, RetSmallStructInRegABI,		Types, IsDarwinVectorABI, RetSmallStructInRegABI,
IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters,		IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters,
CodeGenOpts.FloatABI == "soft"));		CodeGenOpts.FloatABI == "soft", AVXLevel));
}		}
}		}

case llvm::Triple::x86_64: {		case llvm::Triple::x86_64: {
StringRef ABI = getTarget().getABI();		StringRef ABI = getTarget().getABI();
X86AVXABILevel AVXLevel =		X86AVXABILevel AVXLevel =
(ABI == "avx512"		(ABI == "avx512"
? X86AVXABILevel::AVX512		? X86AVXABILevel::AVX512
▲ Show 20 Lines • Show All 135 Lines • Show Last 20 Lines

clang/test/CodeGen/x86_32-align-linux-avx2.c

This file was added.

				// RUN: %clang_cc1 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -target-feature +avx -emit-llvm -o %t %s
				// RUN: FileCheck < %t %s

				#include <immintrin.h>

				// CHECK-LABEL: define void @testm128
				// CHECK-LABEL: %argp.cur = load i8, i8* %args, align 4
				// CHECK-NEXT: %0 = ptrtoint i8* %argp.cur to i32
				// CHECK-NEXT: %1 = add i32 %0, 15
				// CHECK-NEXT: %2 = and i32 %1, -16
				// CHECK-NEXT: %argp.cur.aligned = inttoptr i32 %2 to i8*
				void testm128(int argCount, ...) {
				__m128 res;
				__builtin_va_list args;
				__builtin_va_start(args, argCount);
				res = __builtin_va_arg(args, __m128);
				__builtin_va_end(args);
				}

				// CHECK-LABEL: define void @testm256
				// CHECK-LABEL: %argp.cur = load i8, i8* %args, align 4
				// CHECK-NEXT: %0 = ptrtoint i8* %argp.cur to i32
				// CHECK-NEXT: %1 = add i32 %0, 31
				// CHECK-NEXT: %2 = and i32 %1, -32
				// CHECK-NEXT: %argp.cur.aligned = inttoptr i32 %2 to i8*
				void testm256(int argCount, ...) {
				__m256 res;
				__builtin_va_list args;
				__builtin_va_start(args, argCount);
				res = __builtin_va_arg(args, __m256);
				__builtin_va_end(args);
				}

				// CHECK-LABEL: define void @testm512
				// CHECK-LABEL: %argp.cur = load i8, i8* %args, align 4
				// CHECK-NEXT: %0 = ptrtoint i8* %argp.cur to i32
				// CHECK-NEXT: %1 = add i32 %0, 31
				// CHECK-NEXT: %2 = and i32 %1, -32
				// CHECK-NEXT: %argp.cur.aligned = inttoptr i32 %2 to i8*
				void testm512(int argCount, ...) {
				__m512 res;
				__builtin_va_list args;
				__builtin_va_start(args, argCount);
				res = __builtin_va_arg(args, __m512);
				__builtin_va_end(args);
				}

clang/test/CodeGen/x86_32-align-linux-avx512f.c

This file was added.

				// RUN: %clang_cc1 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -target-feature +avx512f -emit-llvm -o %t %s
				// RUN: FileCheck < %t %s

				#include <immintrin.h>

				// CHECK-LABEL: define void @testm512
				// CHECK-LABEL: %argp.cur = load i8, i8* %args, align 4
				// CHECK-NEXT: %0 = ptrtoint i8* %argp.cur to i32
				// CHECK-NEXT: %1 = add i32 %0, 63
				// CHECK-NEXT: %2 = and i32 %1, -64
				// CHECK-NEXT: %argp.cur.aligned = inttoptr i32 %2 to i8*
				void testm512(int argCount, ...) {
				__m512 res;
				__builtin_va_list args;
				__builtin_va_start(args, argCount);
				res = __builtin_va_arg(args, __m512);
				__builtin_va_end(args);
				}

clang/test/CodeGen/x86_32-align-linux.c

This file was added.

				// RUN: %clang_cc1 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -emit-llvm -o %t %s
				// RUN: FileCheck < %t %s

				#include <immintrin.h>

				// CHECK-LABEL: define void @testm128
				// CHECK-LABEL: %argp.cur = load i8, i8* %args, align 4
				// CHECK-NEXT: %0 = ptrtoint i8* %argp.cur to i32
				// CHECK-NEXT: %1 = add i32 %0, 15
				// CHECK-NEXT: %2 = and i32 %1, -16
				// CHECK-NEXT: %argp.cur.aligned = inttoptr i32 %2 to i8*
				void testm128(int argCount, ...) {
				__m128 res;
				__builtin_va_list args;
				__builtin_va_start(args, argCount);
				res = __builtin_va_arg(args, __m128);
				__builtin_va_end(args);
				}

				// CHECK-LABEL: define void @testm256
				// CHECK-LABEL: %argp.cur = load i8, i8* %args, align 4
				// CHECK-NEXT: %0 = ptrtoint i8* %argp.cur to i32
				// CHECK-NEXT: %1 = add i32 %0, 15
				// CHECK-NEXT: %2 = and i32 %1, -16
				// CHECK-NEXT: %argp.cur.aligned = inttoptr i32 %2 to i8*
				void testm256(int argCount, ...) {
				__m256 res;
				__builtin_va_list args;
				__builtin_va_start(args, argCount);
				res = __builtin_va_arg(args, __m256);
				__builtin_va_end(args);
				}

				// CHECK-LABEL: define void @testm512
				// CHECK-LABEL: %argp.cur = load i8, i8* %args, align 4
				// CHECK-NEXT: %0 = ptrtoint i8* %argp.cur to i32
				// CHECK-NEXT: %1 = add i32 %0, 15
				// CHECK-NEXT: %2 = and i32 %1, -16
				// CHECK-NEXT: %argp.cur.aligned = inttoptr i32 %2 to i8*
				void testm512(int argCount, ...) {
				__m512 res;
				__builtin_va_list args;
				__builtin_va_start(args, argCount);
				res = __builtin_va_arg(args, __m512);
				__builtin_va_end(args);
				}