Index: llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -147,6 +147,7 @@ virtual void assignValueToAddress(const ArgInfo &Arg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) { + assert(Arg.Regs.size() == 1); assignValueToAddress(Arg.Regs[0], Addr, Size, MPO, VA); } Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -313,80 +313,87 @@ EVT VAVT = VA.getValVT(); const LLT OrigTy = getLLTForType(*Args[i].Ty, DL); - if (VA.isRegLoc()) { - if (Handler.isIncomingArgumentHandler() && VAVT != OrigVT) { - if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) { - // Expected to be multiple regs for a single incoming arg. - unsigned NumArgRegs = Args[i].Regs.size(); - if (NumArgRegs < 2) - return false; - - assert((j + (NumArgRegs - 1)) < ArgLocs.size() && - "Too many regs for number of args"); - for (unsigned Part = 0; Part < NumArgRegs; ++Part) { - // There should be Regs.size() ArgLocs per argument. - VA = ArgLocs[j + Part]; - Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA); - } - j += NumArgRegs - 1; - // Merge the split registers into the expected larger result vreg - // of the original call. - MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs); - continue; - } - const LLT VATy(VAVT.getSimpleVT()); - Register NewReg = - MIRBuilder.getMRI()->createGenericVirtualRegister(VATy); - Handler.assignValueToReg(NewReg, VA.getLocReg(), VA); - // If it's a vector type, we either need to truncate the elements - // or do an unmerge to get the lower block of elements. 
- if (VATy.isVector() && - VATy.getNumElements() > OrigVT.getVectorNumElements()) { - // Just handle the case where the VA type is 2 * original type. - if (VATy.getNumElements() != OrigVT.getVectorNumElements() * 2) { - LLVM_DEBUG(dbgs() - << "Incoming promoted vector arg has too many elts"); - return false; - } - auto Unmerge = MIRBuilder.buildUnmerge({OrigTy, OrigTy}, {NewReg}); - MIRBuilder.buildCopy(ArgReg, Unmerge.getReg(0)); - } else { - MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0); + // Expected to be multiple regs for a single incoming arg. + // There should be Regs.size() ArgLocs per argument. + unsigned NumArgRegs = Args[i].Regs.size(); + + assert((j + (NumArgRegs - 1)) < ArgLocs.size() && + "Too many regs for number of args"); + for (unsigned Part = 0; Part < NumArgRegs; ++Part) { + // There should be Regs.size() ArgLocs per argument. + VA = ArgLocs[j + Part]; + if (VA.isMemLoc()) { + // Don't currently support loading/storing a type that needs to be split + // to the stack. Should be easy, just not implemented yet. + if (NumArgRegs > 1) { + LLVM_DEBUG( + dbgs() + << "Load/store a split arg to/from the stack not implemented yet\n"); + return false; } - } else if (!Handler.isIncomingArgumentHandler()) { - assert((j + (Args[i].Regs.size() - 1)) < ArgLocs.size() && - "Too many regs for number of args"); - // This is an outgoing argument that might have been split. - for (unsigned Part = 0; Part < Args[i].Regs.size(); ++Part) { - // There should be Regs.size() ArgLocs per argument. - VA = ArgLocs[j + Part]; - Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA); + + // FIXME: Use correct address space for pointer size + EVT LocVT = VA.getValVT(); + unsigned MemSize = LocVT == MVT::iPTR ? 
DL.getPointerSize() + : LocVT.getStoreSize(); + unsigned Offset = VA.getLocMemOffset(); + MachinePointerInfo MPO; + Register StackAddr = Handler.getStackAddress(MemSize, Offset, MPO); + Handler.assignValueToAddress(Args[i], StackAddr, + MemSize, MPO, VA); + continue; + } + + assert(VA.isRegLoc() && "custom loc should have been handled already"); + + if (OrigVT.getSizeInBits() >= VAVT.getSizeInBits() || + !Handler.isIncomingArgumentHandler()) { + // This is an argument that might have been split. There should be + // Regs.size() ArgLocs per argument. + + // Insert the argument copies. If VAVT < OrigVT, we'll insert the merge + // to the original register after handling all of the parts. + Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA); + continue; + } + + // This ArgLoc covers multiple pieces, so we need to split it. + const LLT VATy(VAVT.getSimpleVT()); + Register NewReg = + MIRBuilder.getMRI()->createGenericVirtualRegister(VATy); + Handler.assignValueToReg(NewReg, VA.getLocReg(), VA); + // If it's a vector type, we either need to truncate the elements + // or do an unmerge to get the lower block of elements. + if (VATy.isVector() && + VATy.getNumElements() > OrigVT.getVectorNumElements()) { + // Just handle the case where the VA type is 2 * original type. + if (VATy.getNumElements() != OrigVT.getVectorNumElements() * 2) { + LLVM_DEBUG(dbgs() + << "Incoming promoted vector arg has too many elts"); + return false; } - j += Args[i].Regs.size() - 1; + auto Unmerge = MIRBuilder.buildUnmerge({OrigTy, OrigTy}, {NewReg}); + MIRBuilder.buildCopy(ArgReg, Unmerge.getReg(0)); } else { - Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA); + MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0); } - } else if (VA.isMemLoc()) { - // Don't currently support loading/storing a type that needs to be split - // to the stack. Should be easy, just not implemented yet. 
- if (Args[i].Regs.size() > 1) { - LLVM_DEBUG( - dbgs() - << "Load/store a split arg to/from the stack not implemented yet"); - return false; + } + + // Now that all pieces have been handled, re-pack any arguments into any + // wider, original registers. + if (Handler.isIncomingArgumentHandler()) { + if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) { + assert(NumArgRegs >= 2); + + // Merge the split registers into the expected larger result vreg + // of the original call. + MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs); } - MVT VT = MVT::getVT(Args[i].Ty); - unsigned Size = VT == MVT::iPTR ? DL.getPointerSize() - : alignTo(VT.getSizeInBits(), 8) / 8; - unsigned Offset = VA.getLocMemOffset(); - MachinePointerInfo MPO; - Register StackAddr = Handler.getStackAddress(Size, Offset, MPO); - Handler.assignValueToAddress(Args[i], StackAddr, Size, MPO, VA); - } else { - // FIXME: Support byvals and other weirdness - return false; } + + j += NumArgRegs - 1; } + return true; } Index: llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -186,6 +186,8 @@ if (!Arg.IsFixed) MaxSize = 0; + assert(Arg.Regs.size() == 1); + Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt ? 
extendRegister(Arg.Regs[0], VA, MaxSize) : Arg.Regs[0]; Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -144,13 +144,17 @@ } } - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize, MachinePointerInfo &MPO, CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); + // The reported memory location may be wider than the value. + const LLT RegTy = MRI.getType(ValVReg); + MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize); + // FIXME: Get alignment auto MMO = MF.getMachineMemOperand( - MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize, inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -182,17 +182,6 @@ ret void } -; Currently can't handle dealing with a split type (s128 -> 2 x s64) on the stack yet. 
-declare void @use_s128(i128 %a, i128 %b) -; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to lower arguments: i32 (i32, i128, i32, i32, i32, i128, i32)* (in function: fn1) -; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for fn1 -; FALLBACK-WITH-REPORT-OUT-LABEL: fn1: -define i32 @fn1(i32 %p1, i128 %p2, i32 %p3, i32 %p4, i32 %p5, i128 %p6, i32 %p7) { -entry: - call void @use_s128(i128 %p2, i128 %p6) - ret i32 0 -} - ; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: %2:fpr(<4 x s16>) = G_ZEXT %0:fpr(<4 x s8>) (in function: zext_v4s8) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for zext_v4s8 ; FALLBACK-WITH-REPORT-OUT-LABEL: zext_v4s8 Index: llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-i128-on-stack.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-i128-on-stack.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-i128-on-stack.ll @@ -1,12 +1,29 @@ -; RUN: llc -O0 -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -global-isel -verify-machineinstrs %s -o - | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "aarch64-linux-gnu" ; Check we don't assert when handling an i128 split arg on the stack. -; CHECK-LABEL: fn1 -; CHECK: ret -define i32 @fn1(i32 %p1, i128 %p2.coerce, i32 %p3, i32 %p4, i32 %p5, i128 %p6.coerce, i32 %p7) { +declare void @use_s128(i128, i128) + +define i32 @fn1(i32 %p1, i128 %p2, i32 %p3, i32 %p4, i32 %p5, i128 %p6, i32 %p7) { +; CHECK-LABEL: fn1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: ldr x9, [sp, #24] +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: mov x1, x3 +; CHECK-NEXT: mov x2, x8 +; CHECK-NEXT: mov x3, x9 +; CHECK-NEXT: bl use_s128 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: - ret i32 undef + call void @use_s128(i128 %p2, i128 %p6) + ret i32 0 }