Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -107,6 +107,12 @@
   // generate any code for it.
   GET_CCMASK,
 
+  // Marks one 64-bit half of an i128 value that has been split into two
+  // GR64 parts. The operands are:
+  // - the 64-bit half
+  // - the subreg index of that half (subreg_h64 or subreg_l64)
+  SUBREG128,
+
   // Use a series of MVCs to copy bytes from one memory location to another.
   // The operands are:
   // - the target address
@@ -517,6 +523,17 @@
   const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
   bool allowTruncateForTailCall(Type *, Type *) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+  // Split / join an i128 value that is not an ABI register copy as two i64
+  // parts wrapped in SUBREG128 nodes.
+  bool splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL,
+                                   SDValue Val, SDValue *Parts,
+                                   unsigned NumParts, MVT PartVT,
+                                   Optional<CallingConv::ID> CC) const override;
+  SDValue
+  joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+                             const SDValue *Parts, unsigned NumParts,
+                             MVT PartVT, EVT ValueVT,
+                             Optional<CallingConv::ID> CC) const override;
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -687,6 +704,8 @@
                                         unsigned BitSize) const;
   MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;
+  MachineBasicBlock *fixupInlineAsm128(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const;
   MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
                                        unsigned Opcode) const;
   MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1129,10 +1129,10 @@
     default: break;
     case 'd': // Data register (equivalent to 'r')
     case 'r': // General-purpose register
-      if (VT == MVT::i64)
+      // Returning GR128 for i128 would create two GR128 virtual registers -
+      // better to return GR64 and treat an i128 operand as 2 x GR64.
+      if (VT == MVT::i64 || VT == MVT::i128)
         return std::make_pair(0U, &SystemZ::GR64BitRegClass);
-      else if (VT == MVT::i128)
-        return std::make_pair(0U, &SystemZ::GR128BitRegClass);
       return std::make_pair(0U, &SystemZ::GR32BitRegClass);
 
     case 'a': // Address register
@@ -1365,6 +1365,59 @@
   }
 }
 
+// Split an i128 value that is not an ABI register copy (such as an inline asm
+// operand) into two i64 parts. Each part is wrapped in a SUBREG128 node that
+// records which half it is; the node for the high half takes the glue result
+// of the node for the low half. Returns true if the value was split here.
+bool SystemZTargetLowering::splitValueIntoRegisterParts(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+    unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+  EVT ValueVT = Val.getValueType();
+  bool IsABIRegCopy = CC.hasValue();
+  if (ValueVT == MVT::i128 && !IsABIRegCopy) {
+    assert(PartVT == MVT::i64 && NumParts == 2 &&
+           "Unknown splitting of i128.");
+    // Build two SUBREG128 pseudos to identify the i128 subregs properly later.
+    SDValue Extr0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+                        MVT::i64, Val, DAG.getIntPtrConstant(0, DL));
+    SDValue Extr1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+                        MVT::i64, Val, DAG.getIntPtrConstant(1, DL));
+    SDVTList NodeTys = DAG.getVTList(MVT::i64, MVT::Glue);
+    // Big endian: Parts[0] is the high half and Parts[1] the low half.
+    Parts[1] = DAG.getNode(SystemZISD::SUBREG128, DL, NodeTys, Extr0,
+                   DAG.getTargetConstant(SystemZ::subreg_l64, DL, MVT::i32));
+    Parts[0] = DAG.getNode(SystemZISD::SUBREG128, DL, MVT::i64, Extr1,
+                   DAG.getTargetConstant(SystemZ::subreg_h64, DL, MVT::i32),
+                   Parts[1].getValue(1));
+    return true;
+  }
+
+  return false;
+}
+
+// Inverse of splitValueIntoRegisterParts(): rebuild a non-ABI i128 value from
+// its two i64 parts, wrapping each part in a SUBREG128 node first. Returns an
+// empty SDValue if the parts were not joined here.
+SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
+    SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
+    MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+  bool IsABIRegCopy = CC.hasValue();
+  if (ValueVT == MVT::i128 && !IsABIRegCopy) {
+    assert(PartVT == MVT::i64 && NumParts == 2 &&
+           "Unknown splitting of i128.");
+    // Build two SUBREG128 pseudos to identify the i128 subregs properly later.
+    SDVTList NodeTys = DAG.getVTList(MVT::i64, MVT::Glue);
+    SDValue SubLo = DAG.getNode(SystemZISD::SUBREG128, DL, NodeTys, Parts[1],
+                      DAG.getTargetConstant(SystemZ::subreg_l64, DL, MVT::i32));
+    SDValue SubHi = DAG.getNode(SystemZISD::SUBREG128, DL, MVT::i64, Parts[0],
+                      DAG.getTargetConstant(SystemZ::subreg_h64, DL, MVT::i32),
+                      SubLo.getValue(1));
+    return DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, SubLo, SubHi);
+  }
+
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -5604,6 +5657,7 @@
     OPCODE(ADDCARRY);
     OPCODE(SUBCARRY);
     OPCODE(GET_CCMASK);
+    OPCODE(SUBREG128);
     OPCODE(MVC);
     OPCODE(MVC_LOOP);
     OPCODE(NC);
@@ -7754,6 +7808,183 @@
   return MBB;
 }
 
+// Find the two SUBREG128 pseudos belonging to the register pair at operands
+// OpIdx (high part) and OpIdx + 1 (low part) of the INLINEASM IASM_MI. For a
+// use these are the definitions of the two registers; for a def they are the
+// single non-debug users, or nullptr where there is not exactly one.
+static void findSUBREG128Pair(MachineInstr *IASM_MI, unsigned OpIdx,
+                              MachineInstr *&SR128_MI1,
+                              MachineInstr *&SR128_MI2,
+                              MachineRegisterInfo *MRI) {
+  MachineOperand &MO1 = IASM_MI->getOperand(OpIdx);
+  MachineOperand &MO2 = IASM_MI->getOperand(OpIdx + 1);
+  if (MO1.isUse()) {
+    SR128_MI1 = MRI->getVRegDef(MO1.getReg());
+    SR128_MI2 = MRI->getVRegDef(MO2.getReg());
+    assert((SR128_MI1 && SR128_MI1->getOpcode() == SystemZ::SUBREG128) &&
+           (SR128_MI2 && SR128_MI2->getOpcode() == SystemZ::SUBREG128) &&
+           "Expected two SUBREG128 definitions.");
+  } else { // isDef
+    // There can be either a use of the full value, or just the low 64 bits,
+    // or no user at all.
+    SR128_MI1 = nullptr;
+    SR128_MI2 = nullptr;
+    if (MRI->hasOneNonDBGUser(MO1.getReg()))
+      SR128_MI1 = &*MRI->use_instr_nodbg_begin(MO1.getReg());
+    if (MRI->hasOneNonDBGUser(MO2.getReg()))
+      SR128_MI2 = &*MRI->use_instr_nodbg_begin(MO2.getReg());
+    assert(((SR128_MI1 == nullptr ||
+             SR128_MI1->getOpcode() == SystemZ::SUBREG128) &&
+            (SR128_MI2 == nullptr ||
+             SR128_MI2->getOpcode() == SystemZ::SUBREG128)) &&
+           "Expected only SUBREG128 or no users.");
+  }
+  assert(((SR128_MI1 == nullptr ||
+           SR128_MI1->getOperand(2).getImm() == SystemZ::subreg_h64) &&
+          (SR128_MI2 == nullptr ||
+           SR128_MI2->getOperand(2).getImm() == SystemZ::subreg_l64)) &&
+         "Corrupt SUBREG128 sequence.");
+}
+
+// Pretend during instruction selection that i128 operands to INLINEASM are
+// split into GR64 parts. In order to get correct register allocation for
+// tied operands they need to be put together here into GR128 bit registers.
+//
+// MI is a SUBREG128 pseudo. If its result is used by, or its source is
+// defined by, an INLINEASM, that INLINEASM is rebuilt with each GR64 pair
+// replaced by a single GR128 operand, and both the old INLINEASM and MI are
+// erased. Otherwise MI is simply erased.
+MachineBasicBlock *SystemZTargetLowering::
+fixupInlineAsm128(MachineInstr &MI, MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  DebugLoc DL = MI.getDebugLoc();
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  // Find the INLINEASM MI.
+  MachineInstr *IASM_MI = nullptr;
+  if (MRI->hasOneNonDBGUser(DstReg)) {
+    IASM_MI = &*MRI->use_instr_nodbg_begin(DstReg);
+    if (IASM_MI->getOpcode() != SystemZ::INLINEASM)
+      IASM_MI = nullptr;
+  }
+  if (IASM_MI == nullptr && MRI->hasOneDef(SrcReg)) {
+    IASM_MI = MRI->getVRegDef(SrcReg);
+    if (IASM_MI->getOpcode() != SystemZ::INLINEASM)
+      IASM_MI = nullptr;
+  }
+  if (IASM_MI == nullptr) {
+    // NOTE(review): this presumes every SUBREG128 belongs to an INLINEASM
+    // that was already rewritten - confirm no other i128 copy reaches here.
+    MI.eraseFromParent(); // Remove other SUBREG128:s already handled.
+    return MBB;
+  }
+
+  // Build a new INLINEASM with GR128 operands.
+  MachineInstrBuilder MIB = BuildMI(*MBB, IASM_MI, IASM_MI->getDebugLoc(),
+                                    TII->get(SystemZ::INLINEASM))
+    .add(IASM_MI->getOperand(0))  // Asm string
+    .add(IASM_MI->getOperand(1)); // ExtraInfo
+
+  std::vector<unsigned> AsmOp2MIOp;
+  unsigned AsmDescOp = InlineAsm::MIOp_FirstOperand;
+  for (unsigned i = AsmDescOp, e = IASM_MI->getNumOperands(); i != e;) {
+    const MachineOperand &MO = IASM_MI->getOperand(i);
+    if (i == AsmDescOp && MO.isImm()) {
+      // The inline asm operand descriptor.
+      unsigned Flag = MO.getImm();
+      unsigned NumOpRegs = InlineAsm::getNumOperandRegisters(Flag);
+      AsmDescOp += 1 + NumOpRegs;
+      AsmOp2MIOp.push_back(i); // Remember the index of the MachineOperand.
+      // Find out if this is a GR64 operand.
+      unsigned RCID = 0;
+      unsigned TiedTo = 0;
+      if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo)) {
+        unsigned MIDefOpIdx = AsmOp2MIOp[TiedTo];
+        unsigned DefFlag = IASM_MI->getOperand(MIDefOpIdx).getImm();
+        InlineAsm::hasRegClassConstraint(DefFlag, RCID);
+      } else
+        InlineAsm::hasRegClassConstraint(Flag, RCID);
+
+      if (RCID == SystemZ::GR64BitRegClassID && NumOpRegs == 2) {
+        MachineInstr *SR128_MI1 = nullptr, *SR128_MI2 = nullptr;
+        findSUBREG128Pair(IASM_MI, i + 1, SR128_MI1, SR128_MI2, MRI);
+        bool IsUse = IASM_MI->getOperand(i + 1).isUse();
+        // Update the new Flag operand:
+        // Set number of registers (bits 3-15) to 1.
+        Flag &= ~0xfff8u;
+        Flag |= (1u << 3);
+        assert(InlineAsm::getNumOperandRegisters(Flag) == 1 && "Bad bits.");
+        // Set the regclass to GR128, unless this is a tied use, whose flag
+        // word encodes the matching def instead of a regclass.
+        if (!IsUse || !IASM_MI->getOperand(i + 1).isTied()) {
+          Flag &= ~(0xffffu << 16);
+          Flag = InlineAsm::getFlagWordForRegClass(Flag,
+                                               SystemZ::GR128BitRegClassID);
+#ifndef NDEBUG
+          unsigned RC = 0;
+          InlineAsm::hasRegClassConstraint(Flag, RC);
+          assert(RC == SystemZ::GR128BitRegClassID && "Bad bits.");
+#endif
+        }
+        // Combine the two parts to a GR128:
+        Register Reg128 =
+            MRI->createVirtualRegister(&SystemZ::GR128BitRegClass);
+        if (IsUse) {
+          MachineOperand &InReg1 = SR128_MI1->getOperand(1);
+          MachineOperand &InReg2 = SR128_MI2->getOperand(1);
+          BuildMI(*MBB, *MIB, DL, TII->get(SystemZ::REG_SEQUENCE), Reg128)
+            .add(InReg1).addImm(SystemZ::subreg_h64)
+            .add(InReg2).addImm(SystemZ::subreg_l64);
+        } else { // def
+          MachineBasicBlock::iterator InsPt = std::next(MIB->getIterator());
+          if (SR128_MI1 != nullptr)
+            BuildMI(*MBB, InsPt, DL, TII->get(SystemZ::COPY),
+                    SR128_MI1->getOperand(0).getReg())
+              .addReg(Reg128, 0, SystemZ::subreg_h64);
+          if (SR128_MI2 != nullptr)
+            BuildMI(*MBB, InsPt, DL, TII->get(SystemZ::COPY),
+                    SR128_MI2->getOperand(0).getReg())
+              .addReg(Reg128, 0, SystemZ::subreg_l64);
+        }
+        MIB.addImm(Flag);
+        MIB.addReg(Reg128, (IsUse ? 0 : RegState::Define));
+        i = AsmDescOp;
+        continue;
+      }
+    }
+    MIB.add(IASM_MI->getOperand(i++));
+  }
+
+  // Tie the registers according to the asm operand descriptors.
+  AsmOp2MIOp.clear();
+  AsmDescOp = InlineAsm::MIOp_FirstOperand;
+  for (unsigned i = AsmDescOp, e = MIB->getNumOperands(); i != e; i++) {
+    const MachineOperand &MO = MIB->getOperand(i);
+    if (i == AsmDescOp && MO.isImm()) {
+      unsigned Flag = MO.getImm(); // The inline asm operand descriptor.
+      unsigned NumOpRegs = InlineAsm::getNumOperandRegisters(Flag);
+      AsmDescOp += 1 + NumOpRegs;
+      AsmOp2MIOp.push_back(i); // Remember the index of the MachineOperand.
+
+      unsigned TiedTo = 0;
+      if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo)) {
+        unsigned MIDefOpIdx = AsmOp2MIOp[TiedTo];
+        for (unsigned Reg = 1; Reg <= NumOpRegs; Reg++)
+          MIB->tieOperands(MIDefOpIdx + Reg, i + Reg);
+      }
+    }
+  }
+
+  IASM_MI->eraseFromParent();
+  MI.eraseFromParent();
+  return MBB;
+}
+
 MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
     MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
   MachineFunction &MF = *MBB->getParent();
@@ -8223,6 +8454,8 @@
     return emitExt128(MI, MBB, false);
   case SystemZ::ZEXT128:
     return emitExt128(MI, MBB, true);
+  case SystemZ::SUBREG128:
+    return fixupInlineAsm128(MI, MBB);
   case SystemZ::ATOMIC_SWAPW:
     return emitAtomicLoadBinary(MI, MBB, 0, 0);
 
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -519,6 +519,12 @@
 let mayLoad = 1, mayStore = 1, Defs = [CC] in
   defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
 
+// One 64-bit half ($src) of an i128 value that was split into GR64 parts;
+// $subreg is the subreg index of that half. Expanded by the custom inserter.
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
+  def SUBREG128 : Pseudo<(outs GR64:$dst), (ins GR64:$src, i32imm:$subreg),
+                   [(set GR64:$dst, (z_subreg_128 GR64:$src, timm:$subreg))]>;
+
 //===----------------------------------------------------------------------===//
 // Conditional move instructions
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZOperators.td
+++ llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -93,6 +93,10 @@
                                              SDTCisPtrTy<2>,
                                              SDTCisVT<3, untyped>,
                                              SDTCisVT<4, untyped>]>;
+def SDT_ZSubreg128          : SDTypeProfile<1, 2,
+                                            [SDTCisVT<0, i64>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
 def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
                                             [SDTCisPtrTy<0>,
                                              SDTCisPtrTy<1>,
@@ -414,6 +418,8 @@
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
                                   SDNPMemOperand]>;
 
+def z_subreg_128        : SDNode<"SystemZISD::SUBREG128", SDT_ZSubreg128,
+                                 [SDNPOutGlue, SDNPOptInGlue]>;
 def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
Index: llvm/test/CodeGen/SystemZ/inline-asm-i128.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/inline-asm-i128.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=s390x-linux-gnu < %s | FileCheck %s
+;
+; Test i128 (tied) operands.
+
+define i32 @clcl(i8* %p1, i32 signext %l1, i8* %p2, i32 signext %l2, i8 zeroext %pad) {
+; CHECK-LABEL: clcl:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT:    .cfi_offset %r12, -64
+; CHECK-NEXT:    .cfi_offset %r13, -56
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    lgr %r0, %r5
+; CHECK-NEXT:    # kill: def $r4d killed $r4d def $r4q
+; CHECK-NEXT:    lgr %r12, %r2
+; CHECK-NEXT:    sllg %r5, %r6, 24
+; CHECK-NEXT:    rosbg %r5, %r0, 40, 63, 0
+; CHECK-NEXT:    risbg %r13, %r3, 40, 191, 0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    clcl %r12, %r4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    ogr %r13, %r5
+; CHECK-NEXT:    risbg %r0, %r13, 40, 191, 0
+; CHECK-NEXT:    ipm %r2
+; CHECK-NEXT:    afi %r2, -268435456
+; CHECK-NEXT:    srl %r2, 31
+; CHECK-NEXT:    lmg %r12, %r15, 96(%r15)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = ptrtoint i8* %p1 to i64
+  %1 = ptrtoint i8* %p2 to i64
+  %and5 = and i32 %l2, 16777215
+  %2 = zext i32 %and5 to i64
+  %conv7 = zext i8 %pad to i64
+  %shl = shl nuw nsw i64 %conv7, 24
+  %or = or i64 %shl, %2
+  %u1.sroa.0.0.insert.ext = zext i64 %0 to i128
+  %u1.sroa.0.0.insert.shift = shl nuw i128 %u1.sroa.0.0.insert.ext, 64
+  %3 = and i32 %l1, 16777215
+  %u1.sroa.0.0.insert.mask = zext i32 %3 to i128
+  %u1.sroa.0.0.insert.insert = or i128 %u1.sroa.0.0.insert.shift, %u1.sroa.0.0.insert.mask
+  %u2.sroa.5.0.insert.ext = zext i64 %or to i128
+  %u2.sroa.0.0.insert.ext = zext i64 %1 to i128
+  %u2.sroa.0.0.insert.shift = shl nuw i128 %u2.sroa.0.0.insert.ext, 64
+  %u2.sroa.0.0.insert.insert = or i128 %u2.sroa.0.0.insert.shift, %u2.sroa.5.0.insert.ext
+  %4 = tail call { i128, i128 } asm "CLCL $0,$1", "=r,=r,0,1"(i128 %u1.sroa.0.0.insert.insert, i128 %u2.sroa.0.0.insert.insert)
+  %asmresult = extractvalue { i128, i128 } %4, 0
+  %asmresult11 = extractvalue { i128, i128 } %4, 1
+  %5 = or i128 %asmresult, %asmresult11
+  %6 = and i128 %5, 16777215
+  %7 = icmp eq i128 %6, 0
+  %land.ext = zext i1 %7 to i32
+  ret i32 %land.ext
+}