diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h
--- a/clang/include/clang/CodeGen/CGFunctionInfo.h
+++ b/clang/include/clang/CodeGen/CGFunctionInfo.h
@@ -88,6 +88,7 @@
   Kind TheKind;
   bool PaddingInReg : 1;
   bool InAllocaSRet : 1;    // isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()
   bool IndirectRealign : 1; // isIndirect()
   bool SRetAfterThis : 1;   // isIndirect()
@@ -110,8 +111,8 @@

 public:
   ABIArgInfo(Kind K = Direct)
-      : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0),
-        TheKind(K), PaddingInReg(false), InAllocaSRet(false),
+      : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K),
+        PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false),
         IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false),
         InReg(false), CanBeFlattened(false), SignExt(false) {}

@@ -185,9 +186,10 @@
     AI.setInReg(true);
     return AI;
   }
-  static ABIArgInfo getInAlloca(unsigned FieldIndex) {
+  static ABIArgInfo getInAlloca(unsigned FieldIndex, bool Indirect = false) {
     auto AI = ABIArgInfo(InAlloca);
     AI.setInAllocaFieldIndex(FieldIndex);
+    AI.setInAllocaIndirect(Indirect);
     return AI;
   }
   static ABIArgInfo getExpand() {
@@ -380,6 +382,15 @@
     AllocaFieldIndex = FieldIndex;
   }

+  unsigned getInAllocaIndirect() const {
+    assert(isInAlloca() && "Invalid kind!");
+    return InAllocaIndirect;
+  }
+  void setInAllocaIndirect(bool Indirect) {
+    assert(isInAlloca() && "Invalid kind!");
+    InAllocaIndirect = Indirect;
+  }
+
   /// Return true if this field of an inalloca struct should be returned
   /// to implement a struct return calling convention.
   bool getInAllocaSRet() const {
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2339,6 +2339,9 @@
       auto FieldIndex = ArgI.getInAllocaFieldIndex();
       Address V =
           Builder.CreateStructGEP(ArgStruct, FieldIndex, Arg->getName());
+      if (ArgI.getInAllocaIndirect())
+        V = Address(Builder.CreateLoad(V),
+                    getContext().getTypeAlignInChars(Ty));
       ArgVals.push_back(ParamValue::forIndirect(V));
       break;
     }
@@ -4038,18 +4041,39 @@
      assert(NumIRArgs == 0);
      assert(getTarget().getTriple().getArch() == llvm::Triple::x86);
      if (I->isAggregate()) {
-       // Replace the placeholder with the appropriate argument slot GEP.
        Address Addr = I->hasLValue()
                           ? I->getKnownLValue().getAddress(*this)
                           : I->getKnownRValue().getAggregateAddress();
        llvm::Instruction *Placeholder =
            cast<llvm::Instruction>(Addr.getPointer());
-       CGBuilderTy::InsertPoint IP = Builder.saveIP();
-       Builder.SetInsertPoint(Placeholder);
-       Addr =
-           Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
-       Builder.restoreIP(IP);
+
+       if (!ArgInfo.getInAllocaIndirect()) {
+         // Replace the placeholder with the appropriate argument slot GEP.
+         CGBuilderTy::InsertPoint IP = Builder.saveIP();
+         Builder.SetInsertPoint(Placeholder);
+         Addr = Builder.CreateStructGEP(ArgMemory,
+                                        ArgInfo.getInAllocaFieldIndex());
+         Builder.restoreIP(IP);
+       } else {
+         // For indirect things such as overaligned structs, replace the
+         // placeholder with a regular aggregate temporary alloca. Store the
+         // address of this alloca into the struct.
+         Addr = CreateMemTemp(info_it->type, "inalloca.indirect.tmp");
+         Address ArgSlot = Builder.CreateStructGEP(
+             ArgMemory, ArgInfo.getInAllocaFieldIndex());
+         Builder.CreateStore(Addr.getPointer(), ArgSlot);
+       }
        deferPlaceholderReplacement(Placeholder, Addr.getPointer());
+     } else if (ArgInfo.getInAllocaIndirect()) {
+       // Make a temporary alloca and store the address of it into the argument
+       // struct.
+       Address Addr = CreateMemTempWithoutCast(
+           I->Ty, getContext().getTypeAlignInChars(I->Ty),
+           "indirect-arg-temp");
+       I->copyInto(*this, Addr);
+       Address ArgSlot =
+           Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
+       Builder.CreateStore(Addr.getPointer(), ArgSlot);
      } else {
        // Store the RValue into the argument struct.
        Address Addr =
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -1676,6 +1676,7 @@
   bool IsVectorCall = State.CC == llvm::CallingConv::X86_VectorCall;

   Ty = useFirstFieldIfTransparentUnion(Ty);
+  TypeInfo TI = getContext().getTypeInfo(Ty);

   // Check with the C++ ABI first.
   const RecordType *RT = Ty->getAs<RecordType>();
@@ -1725,7 +1726,7 @@
     bool NeedsPadding = false;
     bool InReg;
     if (shouldAggregateUseDirect(Ty, State, InReg, NeedsPadding)) {
-      unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32;
+      unsigned SizeInRegs = (TI.Width + 31) / 32;
       SmallVector<llvm::Type *, 3> Elements(SizeInRegs, Int32);
       llvm::Type *Result = llvm::StructType::get(LLVMContext, Elements);
       if (InReg)
@@ -1735,14 +1736,19 @@
     }

     llvm::IntegerType *PaddingType = NeedsPadding ? Int32 : nullptr;

+    // Pass over-aligned aggregates on Windows indirectly. This behavior was
+    // added in MSVC 2015.
+    if (IsWin32StructABI && TI.AlignIsRequired && TI.Align > 32)
+      return getIndirectResult(Ty, /*ByVal=*/false, State);
+
     // Expand small (<= 128-bit) record types when we know that the stack layout
     // of those arguments will match the struct. This is important because the
     // LLVM backend isn't smart enough to remove byval, which inhibits many
     // optimizations.
     // Don't do this for the MCU if there are still free integer registers
     // (see X86_64 ABI for full explanation).
-    if (getContext().getTypeSize(Ty) <= 4 * 32 &&
-        (!IsMCUABI || State.FreeRegs == 0) && canExpandIndirectArgument(Ty))
+    if (TI.Width <= 4 * 32 && (!IsMCUABI || State.FreeRegs == 0) &&
+        canExpandIndirectArgument(Ty))
       return ABIArgInfo::getExpandWithPadding(
           IsFastCall || IsVectorCall || IsRegCall, PaddingType);
@@ -1750,14 +1756,24 @@
   }

   if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // On Windows, vectors are passed directly if registers are available, or
+    // indirectly if not. This avoids the need to align argument memory. Pass
+    // user-defined vector types larger than 512 bits indirectly for simplicity.
+    if (IsWin32StructABI) {
+      if (TI.Width <= 512 && State.FreeSSERegs > 0) {
+        --State.FreeSSERegs;
+        return ABIArgInfo::getDirectInReg();
+      }
+      return getIndirectResult(Ty, /*ByVal=*/false, State);
+    }
+
     // On Darwin, some vectors are passed in memory, we handle this by passing
     // it as an i8/i16/i32/i64.
    if (IsDarwinVectorABI) {
-      uint64_t Size = getContext().getTypeSize(Ty);
-      if ((Size == 8 || Size == 16 || Size == 32) ||
-          (Size == 64 && VT->getNumElements() == 1))
-        return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
-                                                            Size));
+      if ((TI.Width == 8 || TI.Width == 16 || TI.Width == 32) ||
+          (TI.Width == 64 && VT->getNumElements() == 1))
+        return ABIArgInfo::getDirect(
+            llvm::IntegerType::get(getVMContext(), TI.Width));
    }

    if (IsX86_MMXType(CGT.ConvertType(Ty)))
@@ -1787,9 +1803,10 @@
  CCState State(FI);
  if (IsMCUABI)
    State.FreeRegs = 3;
-  else if (State.CC == llvm::CallingConv::X86_FastCall)
+  else if (State.CC == llvm::CallingConv::X86_FastCall) {
    State.FreeRegs = 2;
-  else if (State.CC == llvm::CallingConv::X86_VectorCall) {
+    State.FreeSSERegs = 3;
+  } else if (State.CC == llvm::CallingConv::X86_VectorCall) {
    State.FreeRegs = 2;
    State.FreeSSERegs = 6;
  } else if (FI.getHasRegParm())
@@ -1797,6 +1814,11 @@
  else if (State.CC == llvm::CallingConv::X86_RegCall) {
    State.FreeRegs = 5;
    State.FreeSSERegs = 8;
+  } else if (IsWin32StructABI) {
+    // Since MSVC 2015, the first three SSE vectors have been passed in
+    // registers. The rest are passed indirectly.
+    State.FreeRegs = DefaultNumRegisterParameters;
+    State.FreeSSERegs = 3;
  } else
    State.FreeRegs = DefaultNumRegisterParameters;
@@ -1843,16 +1865,25 @@
                                        CharUnits &StackOffset, ABIArgInfo &Info,
                                        QualType Type) const {
  // Arguments are always 4-byte-aligned.
-  CharUnits FieldAlign = CharUnits::fromQuantity(4);
+  CharUnits WordSize = CharUnits::fromQuantity(4);
+  assert(StackOffset.isMultipleOf(WordSize) && "unaligned inalloca struct");

-  assert(StackOffset.isMultipleOf(FieldAlign) && "unaligned inalloca struct");
-  Info = ABIArgInfo::getInAlloca(FrameFields.size());
-  FrameFields.push_back(CGT.ConvertTypeForMem(Type));
-  StackOffset += getContext().getTypeSizeInChars(Type);
+  // sret pointers and indirect things will require an extra pointer
+  // indirection, unless they are byval. Most things are byval, and will not
+  // require this indirection.
+  bool IsIndirect = false;
+  if (Info.isIndirect() && !Info.getIndirectByVal())
+    IsIndirect = true;
+  Info = ABIArgInfo::getInAlloca(FrameFields.size(), IsIndirect);
+  llvm::Type *LLTy = CGT.ConvertTypeForMem(Type);
+  if (IsIndirect)
+    LLTy = LLTy->getPointerTo(0);
+  FrameFields.push_back(LLTy);
+  StackOffset += IsIndirect ? WordSize : getContext().getTypeSizeInChars(Type);

  // Insert padding bytes to respect alignment.
  CharUnits FieldEnd = StackOffset;
-  StackOffset = FieldEnd.alignTo(FieldAlign);
+  StackOffset = FieldEnd.alignTo(WordSize);
  if (StackOffset != FieldEnd) {
    CharUnits NumBytes = StackOffset - FieldEnd;
    llvm::Type *Ty = llvm::Type::getInt8Ty(getVMContext());
@@ -1866,16 +1897,12 @@
  switch (Info.getKind()) {
  case ABIArgInfo::InAlloca:
    return true;
-  case ABIArgInfo::Indirect:
-    assert(Info.getIndirectByVal());
-    return true;
  case ABIArgInfo::Ignore:
    return false;
+  case ABIArgInfo::Indirect:
  case ABIArgInfo::Direct:
  case ABIArgInfo::Extend:
-    if (Info.getInReg())
-      return false;
-    return true;
+    return !Info.getInReg();
  case ABIArgInfo::Expand:
  case ABIArgInfo::CoerceAndExpand:
    // These are aggregate types which are never passed in registers when
@@ -1909,8 +1936,7 @@
  // Put the sret parameter into the inalloca struct if it's in memory.
  if (Ret.isIndirect() && !Ret.getInReg()) {
-    CanQualType PtrTy = getContext().getPointerType(FI.getReturnType());
-    addFieldToArgStruct(FrameFields, StackOffset, Ret, PtrTy);
+    addFieldToArgStruct(FrameFields, StackOffset, Ret, FI.getReturnType());
    // On Windows, the hidden sret parameter is always returned in eax.
    Ret.setInAllocaSRet(IsWin32StructABI);
  }

diff --git a/clang/test/CodeGen/x86_32-arguments-win32.c b/clang/test/CodeGen/x86_32-arguments-win32.c
--- a/clang/test/CodeGen/x86_32-arguments-win32.c
+++ b/clang/test/CodeGen/x86_32-arguments-win32.c
@@ -46,3 +46,47 @@
 struct s6 f6_1(void) { while (1) {} }

 void f6_2(struct s6 a0) {}
+
+// MSVC passes up to three vectors in registers, and the rest indirectly. We
+// (arbitrarily) pass oversized vectors indirectly, since that is the safest way
+// to do it.
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
+typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef float __m1024 __attribute__((__vector_size__(128), __aligned__(128)));
+
+__m128 gv128;
+__m256 gv256;
+__m512 gv512;
+__m1024 gv1024;
+
+void receive_vec_128(__m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+void receive_vec_256(__m256 x, __m256 y, __m256 z, __m256 w, __m256 q) {
+  gv256 = x + y + z + w + q;
+}
+void receive_vec_512(__m512 x, __m512 y, __m512 z, __m512 w, __m512 q) {
+  gv512 = x + y + z + w + q;
+}
+void receive_vec_1024(__m1024 x, __m1024 y, __m1024 z, __m1024 w, __m1024 q) {
+  gv1024 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* %0, <4 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, <8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, <16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)
+
+void pass_vec_128() {
+  __m128 z = {0};
+  receive_vec_128(z, z, z, z, z);
+}
+
+// CHECK-LABEL: define dso_local void @pass_vec_128()
+// CHECK: call void @receive_vec_128(<4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float>* %{{[^,)]*}}, <4 x float>* %{{[^,)]*}})
+
+
+void __fastcall fastcall_indirect_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local x86_fastcallcc void @"\01@fastcall_indirect_vec@84"(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* inreg %0, i32 inreg %edx, <4 x float>* %1)
diff --git a/clang/test/CodeGenCXX/inalloca-overaligned.cpp b/clang/test/CodeGenCXX/inalloca-overaligned.cpp
new file
--- /dev/null
+++ b/clang/test/CodeGenCXX/inalloca-overaligned.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -fms-extensions -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes overaligned types indirectly since MSVC 2015. Make sure that
+// works with inalloca.
+
+// FIXME: Pass non-trivial *and* overaligned types indirectly. Right now the C++
+// ABI rules say to use inalloca, and they take precedence, so it's not easy to
+// implement this.
+
+
+struct NonTrivial {
+  NonTrivial();
+  NonTrivial(const NonTrivial &o);
+  int x;
+};
+
+struct __declspec(align(64)) OverAligned {
+  OverAligned();
+  int buf[16];
+};
+
+extern int gvi32;
+
+int receive_inalloca_overaligned(NonTrivial nt, OverAligned o) {
+  return nt.x + o.buf[0];
+}
+
+// CHECK-LABEL: define dso_local i32 @"?receive_inalloca_overaligned@@Y{{.*}}"
+// CHECK-SAME: (<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %0)
+
+int pass_inalloca_overaligned() {
+  gvi32 = receive_inalloca_overaligned(NonTrivial(), OverAligned());
+  return gvi32;
+}
+
+// CHECK-LABEL: define dso_local i32 @"?pass_inalloca_overaligned@@Y{{.*}}"
+// CHECK: [[TMP:%[^ ]*]] = alloca %struct.OverAligned, align 64
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: alloca inalloca <{ %struct.NonTrivial, %struct.OverAligned* }>
+
+// Construct OverAligned into TMP.
+// CHECK: call x86_thiscallcc %struct.OverAligned* @"??0OverAligned@@QAE@XZ"(%struct.OverAligned* [[TMP]])
+
+// Construct NonTrivial into the GEP.
+// CHECK: [[GEP:%[^ ]*]] = getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 0
+// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* [[GEP]])
+
+// Store the address of an OverAligned temporary into the struct.
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 1
+// CHECK: store %struct.OverAligned* [[TMP]], %struct.OverAligned** %{{.*}}, align 4
+// CHECK: call i32 @"?receive_inalloca_overaligned@@Y{{.*}}"(<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %argmem)
diff --git a/clang/test/CodeGenCXX/inalloca-vector.cpp b/clang/test/CodeGenCXX/inalloca-vector.cpp
new file
--- /dev/null
+++ b/clang/test/CodeGenCXX/inalloca-vector.cpp
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -w -triple i686-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes up to three vectors in registers, and the rest indirectly. Check
+// that both are compatible with an inalloca prototype.
+
+struct NonTrivial {
+  NonTrivial();
+  NonTrivial(const NonTrivial &o);
+  unsigned handle;
+};
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+__m128 gv128;
+
+// nt, w, and q will be in the inalloca pack.
+void receive_vec_128(NonTrivial nt, __m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local void @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@1111@Z"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %0)
+
+void pass_vec_128() {
+  __m128 z = {0};
+  receive_vec_128(NonTrivial(), z, z, z, z, z);
+}
+// CHECK-LABEL: define dso_local void @"?pass_vec_128@@YAXXZ"()
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 0
+// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* %{{.*}})
+
+// Store q, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 1
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// Store w, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 2
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// CHECK: call void @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@1111@Z"
+// CHECK-SAME: (<4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %{{[^,]*}})
+
+// w will be passed indirectly by register, and q will be passed indirectly, but
+// the pointer will be in memory.
+void __fastcall fastcall_receive_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local x86_fastcallcc void @"?fastcall_receive_vec@@Y{{[^"]*}}"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <4 x float>* inreg %0,
+// CHECK-SAME: i32 inreg %edx,
+// CHECK-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
+
+
+void __vectorcall vectorcall_receive_vec(double xmm0, double xmm1, double xmm2,
+                                         __m128 x, __m128 y, __m128 z,
+                                         __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// FIXME: Enable these checks; clang generates wrong IR for this case.
+// CHECK-LABEL: define dso_local x86_vectorcallcc void @"?vectorcall_receive_vec@@Y{{[^"]*}}"
+// CHECKX-SAME: (double inreg %xmm0,
+// CHECKX-SAME: double inreg %xmm1,
+// CHECKX-SAME: double inreg %xmm2,
+// CHECKX-SAME: <4 x float> inreg %x,
+// CHECKX-SAME: <4 x float> inreg %y,
+// CHECKX-SAME: <4 x float> inreg %z,
+// CHECKX-SAME: <4 x float>* inreg %0,
+// CHECKX-SAME: i32 inreg %edx,
+// CHECKX-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
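
The tests above are the authoritative examples. As a quick reference, the sketch below (not part of the patch; names are illustrative only, and it assumes -fms-extensions with an i686-pc-windows-msvc target like the tests) summarizes the two argument-passing rules the change implements:

// Illustrative sketch only, not part of the patch. Assumes -fms-extensions and
// an i686-pc-windows-msvc target, matching the tests in this change.
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

struct __declspec(align(64)) OverAligned { int buf[16]; };
struct Plain { int x; };

// 'p' is passed by value in the 4-byte-aligned argument memory area, as before.
// 'o' has a required alignment greater than 4 bytes, so the caller builds a
// correctly aligned temporary and passes only its address. If the same call
// also has a non-trivial argument, that pointer becomes a field of the
// inalloca pack rather than a separate byval slot.
void takes_overaligned(Plain p, OverAligned o);

// The first three SSE vector arguments (x, y, z) are passed directly in SSE
// registers (direct, inreg); 'w' gets no register and is passed as a pointer
// to a caller-created temporary.
void takes_vectors(__m128 x, __m128 y, __m128 z, __m128 w);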