Index: lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -304,6 +304,8 @@
   void splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
                            uint64_t UpperVal, uint64_t LowerVal);
 
+  bool tryReplicateConstantSplat(BuildVectorSDNode *BVN);
+
   // Try to use gather instruction Opcode to implement vector insertion N.
   bool tryGather(SDNode *N, unsigned Opcode);
 
@@ -1132,6 +1134,48 @@
   SelectCode(Or.getNode());
 }
 
+// Try to implement the constant-splat BUILD_VECTOR node BVN with a
+// SystemZISD::REPLICATE node (for a sign-extended 16-bit element value)
+// or a SystemZISD::ROTATE_MASK node (for a contiguous or wraparound bit
+// mask), bitcast to the BUILD_VECTOR's type, and select the result.
+// Return true on success; otherwise leave the node alone and return false.
+bool SystemZDAGToDAGISel::tryReplicateConstantSplat(BuildVectorSDNode *BVN) {
+  const SystemZInstrInfo *TII = getInstrInfo();
+  int64_t ReplicatedImm;
+  unsigned RotateStart, RotateEnd;
+  MVT VecVT;
+  if (!SystemZTargetLowering::analyzeBVNForConstantReplication(
+          BVN, ReplicatedImm, RotateStart, RotateEnd, VecVT, TII))
+    return false;
+
+  SDLoc DL(BVN);
+  EVT VT = BVN->getValueType(0);
+  SDValue Op;
+  SDValue BitCast;
+  if (ReplicatedImm != INT64_MAX) {
+    Op = CurDAG->getNode(SystemZISD::REPLICATE, DL, VecVT,
+                         CurDAG->getConstant(ReplicatedImm, DL, MVT::i32, false,
+                                             true /*isOpaque*/));
+    BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op);
+  } else {
+    Op = CurDAG->getNode(
+        SystemZISD::ROTATE_MASK, DL, VecVT,
+        CurDAG->getConstant(RotateStart, DL, MVT::i32, false,
+                            true /*isOpaque*/),
+        CurDAG->getConstant(RotateEnd, DL, MVT::i32, false, true /*isOpaque*/));
+    BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op);
+  }
+
+  ReplaceNode(BVN, BitCast.getNode());
+  SelectCode(BitCast.getNode());
+  if (Op != BitCast) {
+    assert(!Op.use_empty() && "Expected bitcasted SDValue to remain in DAG");
+    SelectCode(Op.getNode());
+  }
+
+  return true;
+}
+
 bool SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
   SDValue ElemV = N->getOperand(2);
   auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
@@ -1538,6 +1582,8 @@
       ReplaceNode(Node, Res);
       return;
     }
+    if (tryReplicateConstantSplat(BVN))
+      return;
     break;
   }
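Illustration (not part of the patch): the decision tryReplicateConstantSplat encodes. A splat element that fits a sign-extended 16-bit immediate becomes a SystemZISD::REPLICATE (VREPI); otherwise, if its bits form a single run of ones, it becomes a SystemZISD::ROTATE_MASK (VGM). isContiguousRun below is a simplified, hypothetical stand-in for TII->isRxSBGMask, which additionally accepts runs that wrap around the element.

    #include <cassert>
    #include <cstdint>

    // True if Value, viewed as a Bits-wide element, is a single run of ones,
    // i.e. something VECTOR GENERATE MASK can produce without wraparound.
    // Values that fit a signed 16-bit immediate (e.g. a splat of 1) take the
    // REPLICATE path first and never reach this check.
    static bool isContiguousRun(uint64_t Value, unsigned Bits) {
      Value &= (Bits == 64 ? ~0ULL : ((1ULL << Bits) - 1));
      if (Value == 0)
        return false;
      uint64_t Run = Value >> __builtin_ctzll(Value); // strip trailing zeros
      return (Run & (Run + 1)) == 0;                  // remaining bits all ones
    }

    int main() {
      // A v4i32 splat of 0xFFFF0000 is not a 16-bit immediate, but its bits
      // are one contiguous run, so ROTATE_MASK (VGMF 0, 15) can materialize it.
      assert(isContiguousRun(0xFFFF0000u, 32));
      // A v4i32 splat of 0x12345678 fits neither form, so lowerBUILD_VECTOR
      // falls back to loading the constant from memory.
      assert(!isContiguousRun(0x12345678u, 32));
      return 0;
    }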
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
 
 #include "SystemZ.h"
+#include "SystemZInstrInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -513,6 +514,11 @@
   }
 
   static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask);
+  static bool analyzeBVNForConstantReplication(BuildVectorSDNode *BVN,
+                                               int64_t &ReplicatedImm,
+                                               unsigned &RotateStart,
+                                               unsigned &RotateEnd, MVT &VecVT,
+                                               const SystemZInstrInfo *TII);
 
 private:
   const SystemZSubtarget &Subtarget;
@@ -638,6 +644,8 @@
   MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
                                          MachineBasicBlock *MBB,
                                          unsigned Opcode) const;
+  MachineBasicBlock *emitFPScalarImm(MachineInstr &MI,
+                                     MachineBasicBlock *MBB) const;
 
   const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
 };
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -577,9 +577,25 @@
   return false;
 }
 
+static bool analyzeFPImm(const APFloat &Imm, unsigned &Start, unsigned &End,
+                         const SystemZInstrInfo *TII) {
+  APInt IntImm = Imm.bitcastToAPInt();
+  if (IntImm.getActiveBits() > 64)
+    return false;
+
+  // See if this immediate could be generated with VGM.
+  return TII->isRxSBGMask(uint64_t(IntImm.getZExtValue()), 64, Start, End);
+}
+
 bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
-  return Imm.isZero() || Imm.isNegZero();
+  if (Imm.isZero() || Imm.isNegZero())
+    return true;
+
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  unsigned Start, End;
+  return analyzeFPImm(Imm, Start, End, TII);
 }
 
 bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
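Illustration (not part of the patch): which scalar doubles the new isFPImmLegal accepts and the VGMG operands analyzeFPImm would compute for them. maskRange is a simplified, hypothetical helper that only handles non-wrapping runs of ones (isRxSBGMask also allows wraparound); for 0.5 it yields the same I2/I3 pair, 2 and 10, that the custom inserter added later in this patch passes to VGMG.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // For a double whose bits form one non-wrapping run of ones, return the
    // SystemZ-style bit positions (0 = most significant bit) of its ends.
    static bool maskRange(double D, unsigned &Start, unsigned &End) {
      uint64_t Bits;
      std::memcpy(&Bits, &D, sizeof Bits);
      if (Bits == 0)
        return false;
      unsigned Lead = __builtin_clzll(Bits);  // zeros above the run
      unsigned Trail = __builtin_ctzll(Bits); // zeros below the run
      uint64_t Run = (~0ULL >> Lead) & (~0ULL << Trail);
      if (Bits != Run)
        return false; // more than one run of ones
      Start = Lead;
      End = 63 - Trail;
      return true;
    }

    int main() {
      unsigned S, E;
      if (maskRange(0.5, S, E))  // bit pattern 0x3FE0000000000000
        std::printf("0.5  -> VGMG %u, %u\n", S, E); // VGMG 2, 10
      if (maskRange(-2.0, S, E)) // bit pattern 0xC000000000000000
        std::printf("-2.0 -> VGMG %u, %u\n", S, E); // VGMG 0, 1
      return 0;
    }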
@@ -4288,49 +4304,6 @@
   return true;
 }
 
-// Try to load a vector constant in which BitsPerElement-bit value Value
-// is replicated to fill the vector. VT is the type of the resulting
-// constant, which may have elements of a different size from BitsPerElement.
-// Return the SDValue of the constant on success, otherwise return
-// an empty value.
-static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
-                                       const SystemZInstrInfo *TII,
-                                       const SDLoc &DL, EVT VT, uint64_t Value,
-                                       unsigned BitsPerElement) {
-  // Signed 16-bit values can be replicated using VREPI.
-  // Mark the constants as opaque or DAGCombiner will convert back to
-  // BUILD_VECTOR.
-  int64_t SignedValue = SignExtend64(Value, BitsPerElement);
-  if (isInt<16>(SignedValue)) {
-    MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
-                                 SystemZ::VectorBits / BitsPerElement);
-    SDValue Op = DAG.getNode(
-        SystemZISD::REPLICATE, DL, VecVT,
-        DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
-    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
-  }
-  // See whether rotating the constant left some N places gives a value that
-  // is one less than a power of 2 (i.e. all zeros followed by all ones).
-  // If so we can use VGM.
-  unsigned Start, End;
-  if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) {
-    // isRxSBGMask returns the bit numbers for a full 64-bit value,
-    // with 0 denoting 1 << 63 and 63 denoting 1. Convert them to
-    // bit numbers for an BitsPerElement value, so that 0 denotes
-    // 1 << (BitsPerElement-1).
-    Start -= 64 - BitsPerElement;
-    End -= 64 - BitsPerElement;
-    MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
-                                 SystemZ::VectorBits / BitsPerElement);
-    SDValue Op = DAG.getNode(
-        SystemZISD::ROTATE_MASK, DL, VecVT,
-        DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
-        DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
-    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
-  }
-  return SDValue();
-}
-
 // If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
 // better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
 // the non-EXTRACT_VECTOR_ELT elements.  See if the given BUILD_VECTOR
@@ -4529,10 +4502,55 @@
   return Result;
 }
 
+bool SystemZTargetLowering::analyzeBVNForConstantReplication(
+    BuildVectorSDNode *BVN, int64_t &ReplicatedImm, unsigned &RotateStart,
+    unsigned &RotateEnd, MVT &VecVT, const SystemZInstrInfo *TII) {
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!(BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             8, true) &&
+        SplatBitSize <= 64))
+    return false;
+  VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
+                           SystemZ::VectorBits / SplatBitSize);
+  ReplicatedImm = INT64_MAX;
+  auto tryValue = [&](uint64_t Value) -> bool {
+    int64_t SignedValue = SignExtend64(Value, SplatBitSize);
+    if (isInt<16>(SignedValue)) {
+      ReplicatedImm = SignedValue;
+      return true;
+    }
+    if (TII->isRxSBGMask(Value, SplatBitSize, RotateStart, RotateEnd)) {
+      RotateStart -= 64 - SplatBitSize;
+      RotateEnd -= 64 - SplatBitSize;
+      return true;
+    }
+    return false;
+  };
+
+  // First try assuming that any undefined bits above the highest set bit
+  // and below the lowest set bit are 1s. This increases the likelihood of
+  // being able to use a sign-extended element value in VECTOR REPLICATE
+  // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
+  uint64_t SplatBitsZ = SplatBits.getZExtValue();
+  uint64_t SplatUndefZ = SplatUndef.getZExtValue();
+  uint64_t Lower =
+      (SplatUndefZ & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
+  uint64_t Upper =
+      (SplatUndefZ & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
+  if (tryValue(SplatBitsZ | Upper | Lower))
+    return true;
+
+  // Now try assuming that any undefined bits between the first and
+  // last defined set bits are set. This increases the chances of
+  // using a non-wraparound mask.
+  uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
+  return tryValue(SplatBitsZ | Middle);
+}
+
 SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
-  const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
   auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
@@ -4548,37 +4566,14 @@
     return Op;
 
   // Try using some form of replication.
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
-                           8, true) &&
-      SplatBitSize <= 64) {
-    // First try assuming that any undefined bits above the highest set bit
-    // and below the lowest set bit are 1s. This increases the likelihood of
-    // being able to use a sign-extended element value in VECTOR REPLICATE
-    // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
-    uint64_t SplatBitsZ = SplatBits.getZExtValue();
-    uint64_t SplatUndefZ = SplatUndef.getZExtValue();
-    uint64_t Lower = (SplatUndefZ
-                      & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
-    uint64_t Upper = (SplatUndefZ
-                      & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
-    uint64_t Value = SplatBitsZ | Upper | Lower;
-    SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value,
-                                         SplatBitSize);
-    if (Op.getNode())
-      return Op;
-
-    // Now try assuming that any undefined bits between the first and
-    // last defined set bits are set. This increases the chances of
-    // using a non-wraparound mask.
-    uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
-    Value = SplatBitsZ | Middle;
-    Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize);
-    if (Op.getNode())
-      return Op;
-  }
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  int64_t ReplicatedImm;
+  unsigned RotateStart, RotateEnd;
+  MVT VecVT;
+  if (analyzeBVNForConstantReplication(BVN, ReplicatedImm, RotateStart,
                                       RotateEnd, VecVT, TII))
+    return Op;
 
   // Fall back to loading it from memory.
   return SDValue();
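Illustration (not part of the patch), using made-up SplatBits/SplatUndef values: the first tryValue attempt above assumes that undefined bits outside the defined run are ones, which can turn a partially undefined splat into a plain VECTOR REPLICATE IMMEDIATE. Suppose the analysis is looking at a 64-bit splat value whose byte 1 is defined as 0xFF and whose remaining bytes are undefined.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Made-up analysis results for the splat described above.
      uint64_t SplatBits = 0x000000000000FF00ull;  // defined one bits
      uint64_t SplatUndef = 0xFFFFFFFFFFFF00FFull; // undefined bit positions

      // Same arithmetic as the first tryValue attempt in the patch:
      // findFirstSet(SplatBits) == 8 and findLastSet(SplatBits) == 15, so
      // undef bits below bit 8 and above bit 15 are assumed to be ones.
      uint64_t Lower = SplatUndef & ((uint64_t(1) << 8) - 1);
      uint64_t Upper = SplatUndef & ~((uint64_t(1) << 15) - 1);
      uint64_t Value = SplatBits | Upper | Lower;

      // Prints 0xffffffffffffffff: the value sign-extends to -1, which fits
      // the 16-bit REPLICATE immediate, so no constant-pool load is needed.
      std::printf("0x%016llx\n", (unsigned long long)Value);
      return 0;
    }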
@@ -7172,6 +7167,34 @@
   return MBB;
 }
 
+MachineBasicBlock *SystemZTargetLowering::emitFPScalarImm(
+    MachineInstr &MI, MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+
+  APFloat Imm = MI.getOperand(1).getFPImm()->getValueAPF();
+  assert(!Imm.isZero() && !Imm.isNegZero() && "Expected non-zero FP immediate");
+  unsigned Start, End;
+  bool Success = analyzeFPImm(Imm, Start, End, TII);
+  assert(Success && "Can't build FP immediate.");
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SubRegIdx = (MRI->getRegClass(DstReg) == &SystemZ::FP32BitRegClass ?
+                        SystemZ::subreg_h32 : SystemZ::subreg_h64);
+  unsigned VReg = MRI->createVirtualRegister(&SystemZ::VF128BitRegClass);
+  DebugLoc DL = MI.getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(SystemZ::VGMG), VReg)
+    .addImm(Start)
+    .addImm(End);
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), DstReg)
+    .addReg(VReg, RegState::Kill, SubRegIdx);
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
 MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *MBB) const {
   switch (MI.getOpcode()) {
@@ -7436,6 +7459,10 @@
   case TargetOpcode::PATCHPOINT:
     return emitPatchPoint(MI, MBB);
 
+  case SystemZ::FP32ScalarImmPseudo:
+  case SystemZ::FP64ScalarImmPseudo:
+    return emitFPScalarImm(MI, MBB);
+
   default:
     llvm_unreachable("Unexpected instr type to insert");
   }
Index: lib/Target/SystemZ/SystemZInstrFP.td
===================================================================
--- lib/Target/SystemZ/SystemZInstrFP.td
+++ lib/Target/SystemZ/SystemZInstrFP.td
@@ -41,6 +41,15 @@
   def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>;
 }
 
+// Load scalar floating-point immediate with a VGM.
+let isAsCheapAsAMove = 1, isMoveImm = 1, usesCustomInserter = 1,
+    hasNoSchedulingInfo = 1 in {
+  def FP32ScalarImmPseudo : Pseudo<(outs FP32:$R1), (ins FP32:$Imm),
+                                   [(set FP32:$R1, (fpimm:$Imm))]>;
+  def FP64ScalarImmPseudo : Pseudo<(outs FP64:$R1), (ins FP64:$Imm),
+                                   [(set FP64:$R1, (fpimm:$Imm))]>;
+}
+
 // Moves between two floating-point registers.
 def LER : UnaryRR <"ler", 0x38, null_frag, FP32, FP32>;
 def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>;
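Illustration (not part of the patch): the intended effect on a trivial function, assuming the FP64 pseudo is selected for its constant. 0.5 has bit pattern 0x3FE0000000000000, a contiguous run of ones at SystemZ bit positions 2 through 10, so analyzeFPImm reports it legal and emitFPScalarImm can materialize it with one VGMG plus a subregister copy rather than a constant-pool load.

    // A sketch of the expected lowering under this patch (not verified output):
    //   VGMG  %vN, 2, 10           ; 0x3FE0000000000000 in each doubleword
    //   COPY  %fM, %vN:subreg_h64  ; move the doubleword into the FP register
    // Previously the constant would have been loaded from the constant pool.
    double half() { return 0.5; }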