diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -1318,6 +1318,7 @@ BCLalways, BCLn, BCTRL8_LDinto_toc, + BCTRL_LWZinto_toc, BCn, CTRL_DEP )>; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2441,10 +2441,6 @@ } unsigned PPCFrameLowering::getTOCSaveOffset() const { - if (Subtarget.isAIXABI()) - // TOC save/restore is normally handled by the linker. - // Indirect calls should hit this limitation. - report_fatal_error("TOC save is not implemented on AIX yet."); return TOCSaveOffset; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -174,7 +174,8 @@ BCTRL, /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl - /// instruction and the TOC reload required on SVR4 PPC64. + /// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX + /// and 64-bit AIX. 
BCTRL_LOAD_TOC, /// Return with a flag operand, matched by 'blr' diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -3151,11 +3151,17 @@ SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget.isAIXABI()) + report_fatal_error("ADJUST_TRAMPOLINE operation not supported on AIX."); + return Op.getOperand(0); } SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget.isAIXABI()) + report_fatal_error("INIT_TRAMPOLINE operation not supported on AIX."); + SDValue Chain = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function @@ -5190,34 +5196,48 @@ MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); + // Registers used in building the DAG. + const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister(); + const MCRegister TOCReg = Subtarget.getTOCPointerRegister(); + + // Offsets of descriptor members. + const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset(); + const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset(); + + const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + const unsigned Alignment = Subtarget.isPPC64() ? 8 : 4; + // One load for the functions entry point address. - SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, - /* Alignment = */ 8, MMOFlags); + SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI, + Alignment, MMOFlags); // One for loading the TOC anchor for the module that contains the called // function. 
- SDValue TOCOff = DAG.getIntPtrConstant(8, dl); - SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); + SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff); SDValue TOCPtr = - DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), - /* Alignment = */ 8, MMOFlags); + DAG.getLoad(RegVT, dl, LDChain, AddTOC, + MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags); // One for loading the environment pointer. - SDValue PtrOff = DAG.getIntPtrConstant(16, dl); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); + SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff); SDValue LoadEnvPtr = - DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), - /* Alignment = */ 8, MMOFlags); + DAG.getLoad(RegVT, dl, LDChain, AddPtr, + MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags); + // Then copy the newly loaded TOC anchor to the TOC pointer. - SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, Glue); + SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue); Chain = TOCVal.getValue(0); Glue = TOCVal.getValue(1); // If the function call has an explicit 'nest' parameter, it takes the // place of the environment pointer. + assert((!hasNest || !Subtarget.isAIXABI()) && + "Nest parameter not supported on AIX"); if (!hasNest) { - SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, Glue); + SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue); Chain = EnvVal.getValue(0); Glue = EnvVal.getValue(1); } @@ -5235,6 +5255,10 @@ &RegsToPass, SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff, const PPCSubtarget &Subtarget, bool isIndirect) { + const bool IsPPC64 = Subtarget.isPPC64(); + // MVT for a general purpose register. + const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; + // First operand is always the chain. 
Ops.push_back(Chain); @@ -5243,28 +5267,29 @@ Ops.push_back(Callee); else { assert(!isPatchPoint && "Patch point call are not indirect."); - if (Subtarget.isAIXABI()) - report_fatal_error("Indirect call on AIX is not implemented."); // We created a save of the TOC pointer to the toc save area on the stack in - // the first part of LowerCall_64SVR4. Here we add the 'ADD' instruction as - // an operand to the call. - if (Subtarget.is64BitELFABI()) { - SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); + // the first part of LowerCall_64SVR4 or LowerCall_AIX. Here we add the + // 'ADD' instruction as an operand to the call. + if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { + const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); + + SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); - SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff); Ops.push_back(AddTOC); } // Add the register used for the environment pointer. if (Subtarget.usesFunctionDescriptors() && !hasNest) - Ops.push_back(DAG.getRegister(PPC::X11, MVT::i64)); + Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(), + RegVT)); + // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) - Ops.push_back(DAG.getRegister(Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, - Subtarget.isPPC64() ? MVT::i64 : MVT::i32)); + Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT)); } // If this is a tail call add stack pointer delta. @@ -5281,8 +5306,7 @@ // no way to mark dependencies as implicit here. // We will add the R2/X2 dependency in EmitInstrWithCustomInserter. if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) && !isPatchPoint) - Ops.push_back(DAG.getRegister(Subtarget.isPPC64() ? 
PPC::X2 : PPC::R2, - Subtarget.isPPC64() ? MVT::i64 : MVT::i32)); + Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT)); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls if (isVarArg && Subtarget.is32BitELFABI()) @@ -6829,9 +6853,6 @@ if (isVarArg || isPatchPoint) report_fatal_error("This call type is unimplemented on AIX."); - if (!isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) - report_fatal_error("Handling of indirect call is unimplemented!"); - const PPCSubtarget& Subtarget = static_cast<const PPCSubtarget&>(DAG.getSubtarget()); if (Subtarget.hasQPX()) @@ -6890,6 +6911,26 @@ "unimplemented!"); } + // For indirect calls, we need to save the TOC base to the stack for + // restoration after the call. + if (!isTailCall && !isPatchPoint && + !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) { + const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister(); + const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); + const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + const unsigned TOCSaveOffset = + Subtarget.getFrameLowering()->getTOCSaveOffset(); + + setUsesTOCBasePtr(DAG); + SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT); + SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); + SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, + MachinePointerInfo::getStack( + DAG.getMachineFunction(), TOCSaveOffset)); + } + // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
SDValue InFlag; diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -1529,6 +1529,29 @@ let BH = 0; } +class XLForm_2_ext_and_DForm_1<bits<6> opcode1, bits<10>xo1, bits<5> bo, + bits<5> bi, bits<2> bh, bit lk, bits<6> opcode2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> { + + bits<5> RST; + bits<19> DS_RA; + + let Pattern = pattern; + + let Inst{6-10} = bo; + let Inst{11-15} = bi; + let Inst{16-18} = 0; // unused. + let Inst{19-20} = bh; + let Inst{21-30} = xo1; + let Inst{31} = lk; + + let Inst{38-42} = RST; + let Inst{43-47} = DS_RA{18-14}; // Register # + let Inst{48-61} = DS_RA{13-0}; // Displacement. +} + // 1.7.8 XFX-Form class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin> diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1645,6 +1645,15 @@ "#TC_RETURNr $dst $offset", []>; +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR, R2], Uses = [CTR, RM], RST = 2 in { + def BCTRL_LWZinto_toc: + XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 0, 1, 32, (outs), + (ins memrix:$src), "bctrl\n\tlwz 2, $src", IIC_BrB, + [(PPCbctrl_load_toc iaddrX4:$src)]>, Requires<[In32BitMode]>; + +} + let isCodeGenOnly = 1 in { diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -354,6 +354,34 @@ return isAIXABI() || (is64BitELFABI() && !isELFv2ABI()); } + unsigned descriptorTOCAnchorOffset() const { + assert(usesFunctionDescriptors() && + "should only be called when target uses descriptors."); + return IsPPC64 ? 
8 : 4; + } + + unsigned descriptorEnvironmentPointerOffset() const { + assert(usesFunctionDescriptors() && + "should only be called when target uses descriptors."); + return IsPPC64 ? 16 : 8; + } + + MCRegister getEnvironmentPointerRegister() const { + assert(usesFunctionDescriptors() && + "should only be called when target uses descriptors."); + return IsPPC64 ? PPC::X11 : PPC::R11; + } + + MCRegister getTOCPointerRegister() const { + assert((is64BitELFABI() || isAIXABI()) && + "should only be called when target is TOC based ABI."); + return IsPPC64 ? PPC::X2 : PPC::R2; + } + + MCRegister getStackPointerRegister() const { + return IsPPC64 ? PPC::X1 : PPC::R1; + } + bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; } }; } // End llvm namespace diff --git a/llvm/test/CodeGen/PowerPC/aix-trampoline.ll b/llvm/test/CodeGen/PowerPC/aix-trampoline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-trampoline.ll @@ -0,0 +1,14 @@ +; RUN: not llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: INIT_TRAMPOLINE operation not supported on AIX. 
+ +define void @create_trampoline(i8* %buffer, i8* %nval) nounwind { +entry: + call void @llvm.init.trampoline(i8* %buffer, i8* bitcast (i32 (i32)* @nested to i8*) , i8* %nval) + ret void +} + +declare i32 @nested(i32); + +declare void @llvm.init.trampoline(i8*, i8*, i8*) nounwind diff --git a/llvm/test/CodeGen/PowerPC/aix_indirect_call.ll b/llvm/test/CodeGen/PowerPC/aix_indirect_call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix_indirect_call.ll @@ -0,0 +1,129 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec \ +; RUN: -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp < %s | \ +; RUN: FileCheck --check-prefixes=CHECKMIR,MIR32 %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec \ +; RUN: -mtriple powerpc-ibm-aix-xcoff < %s | \ +; RUN: FileCheck --check-prefixes=CHECKASM,ASM32 %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp < %s | \ +; RUN: FileCheck --check-prefixes=CHECKMIR,MIR64 %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \ +; RUN: FileCheck --check-prefixes=CHECKASM,ASM64 %s + +define signext i32 @callThroughPtr(i32 ()* nocapture) { + %2 = tail call signext i32 %0() + ret i32 %2 +} + +; CHECKMIR: name: callThroughPtr + +; MIR32: liveins: $r3 +; MIR32: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; MIR32-NEXT: renamable $r11 = LWZ 8, renamable $r3 :: (dereferenceable invariant load 4 from %ir.0 + 8) +; MIR32-NEXT: renamable $r4 = LWZ 0, renamable $r3 :: (dereferenceable invariant load 4 from %ir.0) +; MIR32-NEXT: STW $r2, 20, $r1 +; MIR32-NEXT: $r2 = LWZ 4, killed renamable $r3 :: (dereferenceable invariant load 4 from %ir.0 + 4) +; MIR32-NEXT: MTCTR killed renamable $r4, implicit-def $ctr +; MIR32-NEXT: BCTRL_LWZinto_toc 20, $r1, csr_aix32, implicit-def dead $lr, implicit-def dead $r2, implicit $ctr, implicit $rm, implicit $r11, 
implicit $r2, implicit-def $r1, implicit-def $r3 +; MIR32-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + + +; MIR64: liveins: $x3 +; MIR64: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; MIR64-NEXT: renamable $x11 = LD 16, renamable $x3 :: (dereferenceable invariant load 8 from %ir.0 + 16) +; MIR64-NEXT: renamable $x4 = LD 0, renamable $x3 :: (dereferenceable invariant load 8 from %ir.0) +; MIR64-NEXT: STD $x2, 40, $x1 :: (store 8 into stack + 40) +; MIR64-NEXT: $x2 = LD 8, killed renamable $x3 :: (dereferenceable invariant load 8 from %ir.0 + 8) +; MIR64-NEXT: MTCTR8 killed renamable $x4, implicit-def $ctr8 +; MIR64-NEXT: BCTRL8_LDinto_toc 40, $x1, csr_aix64, implicit-def dead $lr8, implicit-def dead $x2, implicit $ctr8, implicit $rm, implicit $x11, implicit $x2, implicit-def $r1, implicit-def $x3 +; MIR64-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +; CHECKASM-LABEL: .callThroughPtr: + +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz 4, 0(3) +; ASM32-NEXT: lwz 11, 8(3) +; ASM32-NEXT: stw 2, 20(1) +; ASM32-NEXT: lwz 2, 4(3) +; ASM32-NEXT: mtctr 4 +; ASM32-NEXT: bctrl +; ASM32-NEXT: lwz 2, 20(1) +; ASM32-NEXT: addi 1, 1, 64 + +; ASM64: stdu 1, -112(1) +; ASM64-NEXT: ld 4, 0(3) +; ASM64-NEXT: ld 11, 16(3) +; ASM64-NEXT: std 2, 40(1) +; ASM64-NEXT: ld 2, 8(3) +; ASM64-NEXT: mtctr 4 +; ASM64-NEXT: bctrl +; ASM64-NEXT: ld 2, 40(1) +; ASM64-NEXT: addi 1, 1, 112 + + +define void @callThroughPtrWithArgs(void (i32, i1, i64)* nocapture) { + tail call void %0(i32 signext 1, i1 zeroext 2, i64 3) + ret void +} + +; CHECKMIR: name: callThroughPtrWithArgs + +; MIR32: liveins: $r3 +; MIR32: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; MIR32-NEXT: renamable $r4 = LWZ 0, renamable $r3 :: (dereferenceable invariant load 4 from %ir.0) +; MIR32-NEXT: renamable $r11 = LWZ 8, renamable $r3 :: (dereferenceable invariant load 4 from %ir.0 + 8) +; MIR32-NEXT: STW $r2, 20, $r1 :: (store 4 into stack + 20) +; MIR32-NEXT: 
$r2 = LWZ 4, killed renamable $r3 :: (dereferenceable invariant load 4 from %ir.0 + 4) +; MIR32-NEXT: MTCTR killed renamable $r4, implicit-def $ctr +; MIR32-NEXT: $r3 = LI 1 +; MIR32-NEXT: $r4 = LI 0 +; MIR32-NEXT: $r5 = LI 0 +; MIR32-NEXT: $r6 = LI 3 +; MIR32-NEXT: BCTRL_LWZinto_toc 20, $r1, csr_aix32, implicit-def dead $lr, implicit-def dead $r2, implicit $ctr, implicit $rm, implicit $r11, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r2, implicit-def $r1 +; MIR32-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; MIR64: liveins: $x3 +; MIR64: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; MIR64-NEXT: renamable $x4 = LD 0, renamable $x3 :: (dereferenceable invariant load 8 from %ir.0) +; MIR64-NEXT: renamable $x11 = LD 16, renamable $x3 :: (dereferenceable invariant load 8 from %ir.0 + 16) +; MIR64-NEXT: STD $x2, 40, $x1 :: (store 8 into stack + 40) +; MIR64-NEXT: $x2 = LD 8, killed renamable $x3 :: (dereferenceable invariant load 8 from %ir.0 + 8) +; MIR64-NEXT: MTCTR8 killed renamable $x4, implicit-def $ctr8 +; MIR64-NEXT: $x3 = LI8 1 +; MIR64-NEXT: $x4 = LI8 0 +; MIR64-NEXT: $x5 = LI8 3 +; MIR64-NEXT: BCTRL8_LDinto_toc 40, $x1, csr_aix64, implicit-def dead $lr8, implicit-def dead $x2, implicit $ctr8, implicit $rm, implicit $x11, implicit $x3, implicit $x4, implicit $x5, implicit $x2, implicit-def $r1 +; MIR64-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +; CHECKASM-LABEL: .callThroughPtrWithArgs: + +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz 4, 0(3) +; ASM32-NEXT: lwz 11, 8(3) +; ASM32-NEXT: li 5, 0 +; ASM32-NEXT: li 6, 3 +; ASM32-NEXT: stw 2, 20(1) +; ASM32-NEXT: lwz 2, 4(3) +; ASM32-NEXT: li 3, 1 +; ASM32-NEXT: mtctr 4 +; ASM32-NEXT: li 4, 0 +; ASM32-NEXT: bctrl +; ASM32-NEXT: lwz 2, 20(1) +; ASM32-NEXT: addi 1, 1, 64 + +; ASM64: stdu 1, -112(1) +; ASM64-DAG: ld 4, 0(3) +; ASM64-DAG: ld 11, 16(3) +; ASM64-DAG: li 5, 3 +; ASM64-DAG: std 2, 40(1) +; ASM64-DAG: ld 2, 8(3) +; ASM64-DAG: 
li 3, 1 +; ASM64-DAG: mtctr 4 +; ASM64-DAG: li 4, 0 +; ASM64: bctrl +; ASM64-NEXT: ld 2, 40(1) +; ASM64-NEXT: addi 1, 1, 112