Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -313,80 +313,95 @@
     EVT VAVT = VA.getValVT();
     const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
 
-    if (VA.isRegLoc()) {
-      if (Handler.isIncomingArgumentHandler() && VAVT != OrigVT) {
-        if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) {
-          // Expected to be multiple regs for a single incoming arg.
-          unsigned NumArgRegs = Args[i].Regs.size();
-          if (NumArgRegs < 2)
-            return false;
-
-          assert((j + (NumArgRegs - 1)) < ArgLocs.size() &&
-                 "Too many regs for number of args");
-          for (unsigned Part = 0; Part < NumArgRegs; ++Part) {
-            // There should be Regs.size() ArgLocs per argument.
-            VA = ArgLocs[j + Part];
-            Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
-          }
-          j += NumArgRegs - 1;
-          // Merge the split registers into the expected larger result vreg
-          // of the original call.
-          MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs);
-          continue;
-        }
-        const LLT VATy(VAVT.getSimpleVT());
-        Register NewReg =
-            MIRBuilder.getMRI()->createGenericVirtualRegister(VATy);
-        Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
-        // If it's a vector type, we either need to truncate the elements
-        // or do an unmerge to get the lower block of elements.
-        if (VATy.isVector() &&
-            VATy.getNumElements() > OrigVT.getVectorNumElements()) {
-          // Just handle the case where the VA type is 2 * original type.
-          if (VATy.getNumElements() != OrigVT.getVectorNumElements() * 2) {
-            LLVM_DEBUG(dbgs()
-                       << "Incoming promoted vector arg has too many elts");
-            return false;
-          }
-          auto Unmerge = MIRBuilder.buildUnmerge({OrigTy, OrigTy}, {NewReg});
-          MIRBuilder.buildCopy(ArgReg, Unmerge.getReg(0));
-        } else {
-          MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0);
+    // Expected to be multiple regs for a single incoming arg.
+    // There should be Regs.size() ArgLocs per argument.
+    unsigned NumArgRegs = Args[i].Regs.size();
+
+    assert((j + (NumArgRegs - 1)) < ArgLocs.size() &&
+           "Too many regs for number of args");
+    for (unsigned Part = 0; Part < NumArgRegs; ++Part) {
+      // There should be Regs.size() ArgLocs per argument.
+      VA = ArgLocs[j + Part];
+      if (VA.isMemLoc()) {
+        // Don't currently support loading/storing a type that needs to be split
+        // to the stack. Should be easy, just not implemented yet.
+        if (NumArgRegs > 1) {
+          LLVM_DEBUG(
+              dbgs()
+              << "Load/store a split arg to/from the stack not implemented yet\n");
+          return false;
         }
-      } else if (!Handler.isIncomingArgumentHandler()) {
-        assert((j + (Args[i].Regs.size() - 1)) < ArgLocs.size() &&
-               "Too many regs for number of args");
-        // This is an outgoing argument that might have been split.
-        for (unsigned Part = 0; Part < Args[i].Regs.size(); ++Part) {
-          // There should be Regs.size() ArgLocs per argument.
-          VA = ArgLocs[j + Part];
-          Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
+
+        // It's possible for the calling convention to report a wider type size
+        // than the value, so clamp to the actual value size for the memory
+        // access to avoid emitting an invalid load/store.
+        //
+        // TODO: Would it be better to emit the type size, and trunc/extend to
+        // the value size?
+        unsigned ValSize = std::min(VAVT.getSizeInBits(),
+                                    OrigVT.getSizeInBits());
+
+        // FIXME: Use correct address space for pointer size
+        unsigned Size = VAVT == MVT::iPTR ? DL.getPointerSize()
+                                          : alignTo(ValSize, 8) / 8;
+        unsigned Offset = VA.getLocMemOffset();
+        MachinePointerInfo MPO;
+        Register StackAddr = Handler.getStackAddress(Size, Offset, MPO);
+        Handler.assignValueToAddress(Args[i].Regs[Part], StackAddr,
+                                     Size, MPO, VA);
+        continue;
+      }
+
+      assert(VA.isRegLoc() && "custom loc should have been handled already");
+
+      if (OrigVT.getSizeInBits() >= VAVT.getSizeInBits() ||
+          !Handler.isIncomingArgumentHandler()) {
+        // This is an argument that might have been split. There should be
+        // Regs.size() ArgLocs per argument.
+
+        // Insert the argument copies. If VAVT < OrigVT, we'll insert the merge
+        // to the original register after handling all of the parts.
+        Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
+        continue;
+      }
+
+      // This ArgLoc covers multiple pieces, so we need to split it.
+      const LLT VATy(VAVT.getSimpleVT());
+      Register NewReg =
+          MIRBuilder.getMRI()->createGenericVirtualRegister(VATy);
+      Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
+      // If it's a vector type, we either need to truncate the elements
+      // or do an unmerge to get the lower block of elements.
+      if (VATy.isVector() &&
+          VATy.getNumElements() > OrigVT.getVectorNumElements()) {
+        // Just handle the case where the VA type is 2 * original type.
+        if (VATy.getNumElements() != OrigVT.getVectorNumElements() * 2) {
+          LLVM_DEBUG(dbgs()
+                     << "Incoming promoted vector arg has too many elts");
+          return false;
         }
-        j += Args[i].Regs.size() - 1;
+        auto Unmerge = MIRBuilder.buildUnmerge({OrigTy, OrigTy}, {NewReg});
+        MIRBuilder.buildCopy(ArgReg, Unmerge.getReg(0));
       } else {
-        Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
+        MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0);
       }
-    } else if (VA.isMemLoc()) {
-      // Don't currently support loading/storing a type that needs to be split
-      // to the stack. Should be easy, just not implemented yet.
-      if (Args[i].Regs.size() > 1) {
-        LLVM_DEBUG(
-            dbgs()
-            << "Load/store a split arg to/from the stack not implemented yet");
-        return false;
+    }
+
+    // Now that all pieces have been handled, re-pack any arguments into any
+    // wider, original registers.
+    if (Handler.isIncomingArgumentHandler()) {
+      if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) {
+        assert(NumArgRegs >= 2);
+
+        // Merge the split registers into the expected larger result vreg
+        // of the original call.
+        MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs);
       }
-      MVT VT = MVT::getVT(Args[i].Ty);
-      unsigned Size = VT == MVT::iPTR ? DL.getPointerSize()
-                                      : alignTo(VT.getSizeInBits(), 8) / 8;
-      unsigned Offset = VA.getLocMemOffset();
-      MachinePointerInfo MPO;
-      Register StackAddr = Handler.getStackAddress(Size, Offset, MPO);
-      Handler.assignValueToAddress(Args[i], StackAddr, Size, MPO, VA);
-    } else {
-      // FIXME: Support byvals and other weirdness
-      return false;
     }
+
+    j += NumArgRegs - 1;
   }
+
   return true;
 }
Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll
+++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll
@@ -23,16 +23,13 @@
 ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp
 ; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64)
-; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s8)
-; CHECK: G_STORE [[ANYEXT]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1)
+; CHECK: G_STORE [[C3]](s8), [[PTR_ADD]](p0) :: (store 1 into stack)
 ; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
 ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64)
-; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s16)
-; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1)
+; CHECK: G_STORE [[C4]](s16), [[PTR_ADD1]](p0) :: (store 2 into stack + 8, align 1)
 ; CHECK: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
 ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64)
-; CHECK: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
-; CHECK: G_STORE [[ANYEXT2]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1)
+; CHECK: G_STORE [[C5]](s32), [[PTR_ADD2]](p0) :: (store 4 into stack + 16, align 1)
 ; CHECK: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
 ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C11]](s64)
 ; CHECK: G_STORE [[C6]](s32), [[PTR_ADD3]](p0) :: (store 4 into stack + 24, align 1)
Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -182,17 +182,6 @@
   ret void
 }
 
-; Currently can't handle dealing with a split type (s128 -> 2 x s64) on the stack yet.
-declare void @use_s128(i128 %a, i128 %b)
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to lower arguments: i32 (i32, i128, i32, i32, i32, i128, i32)* (in function: fn1)
-; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for fn1
-; FALLBACK-WITH-REPORT-OUT-LABEL: fn1:
-define i32 @fn1(i32 %p1, i128 %p2, i32 %p3, i32 %p4, i32 %p5, i128 %p6, i32 %p7) {
-entry:
-  call void @use_s128(i128 %p2, i128 %p6)
-  ret i32 0
-}
-
 ; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: %2:fpr(<4 x s16>) = G_ZEXT %0:fpr(<4 x s8>) (in function: zext_v4s8)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for zext_v4s8
 ; FALLBACK-WITH-REPORT-OUT-LABEL: zext_v4s8
Index: llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-i128-on-stack.ll
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-i128-on-stack.ll
+++ llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-i128-on-stack.ll
@@ -1,12 +1,29 @@
-; RUN: llc -O0 -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -global-isel -verify-machineinstrs %s -o - | FileCheck %s
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-linux-gnu"
 
 ; Check we don't assert when handling an i128 split arg on the stack.
 
-; CHECK-LABEL: fn1
-; CHECK: ret
-define i32 @fn1(i32 %p1, i128 %p2.coerce, i32 %p3, i32 %p4, i32 %p5, i128 %p6.coerce, i32 %p7) {
+declare void @use_s128(i128, i128)
+
+define i32 @fn1(i32 %p1, i128 %p2, i32 %p3, i32 %p4, i32 %p5, i128 %p6, i32 %p7) {
+; CHECK-LABEL: fn1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: ldr x8, [sp, #16]
+; CHECK-NEXT: ldr x9, [sp, #24]
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: mov x2, x8
+; CHECK-NEXT: mov x3, x9
+; CHECK-NEXT: bl use_s128
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
 entry:
-  ret i32 undef
+  call void @use_s128(i128 %p2, i128 %p6)
+  ret i32 0
 }
Index: llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
+++ llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
@@ -68,8 +68,7 @@
 ; CHECK: [[SP:%[0-9]+]]:_(p0) = COPY $sp
 ; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; CHECK: [[SLOT:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP]], [[OFFSET]](s64)
-; CHECK: [[ANSWER_EXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ANSWER]]
-; CHECK: G_STORE [[ANSWER_EXT]](s64), [[SLOT]]
+; CHECK: G_STORE [[ANSWER]](s32), [[SLOT]]
 ; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
 ; CHECK: [[SLOT:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP]], [[OFFSET]](s64)
Index: llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
+++ llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
@@ -262,8 +262,9 @@
 ; CHECK-LABEL: caller4:
 ; CHECK: mov [[ID:x[0-9]+]], x0
-; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
-; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{w[0-9]+}}, [sp]
+; CHECK-NEXT: str {{w[0-9]+}}, [sp, #8]
+; CHECK-NEXT: str {{w[0-9]+}}, [sp, #16]
 ; CHECK: mov x21, xzr
 ; CHECK: bl {{.*}}foo_vararg