diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -49,9 +49,16 @@
   (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
 >;
 
+def dup: GICombineRule <
+  (defs root:$root, shuffle_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+         [{ return matchDup(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
 // Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
 // instruction.
-def shuffle_vector_pseudos : GICombineGroup<[rev, zip, uzp]>;
+def shuffle_vector_pseudos : GICombineGroup<[dup, rev, zip, uzp]>;
 
 def AArch64PostLegalizerCombinerHelper
     : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -74,6 +74,13 @@
   let InOperandList = (ins type0:$v1, type0:$v2);
 }
 
+// Represents a dup instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_DUP: AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$lane);
+}
+
 def : GINodeEquiv<G_REV16, AArch64rev16>;
 def : GINodeEquiv<G_REV32, AArch64rev32>;
 def : GINodeEquiv<G_REV64, AArch64rev64>;
@@ -81,3 +88,4 @@
 def : GINodeEquiv<G_UZP2, AArch64uzp2>;
 def : GINodeEquiv<G_ZIP1, AArch64zip1>;
 def : GINodeEquiv<G_ZIP2, AArch64zip2>;
+def : GINodeEquiv<G_DUP, AArch64dup>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -501,6 +501,7 @@
     const MachineInstr &MI, const MachineRegisterInfo &MRI,
     const TargetRegisterInfo &TRI) const {
   switch (MI.getOpcode()) {
+  case AArch64::G_DUP:
   case TargetOpcode::G_SITOFP:
   case TargetOpcode::G_UITOFP:
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -642,6 +643,16 @@
   // Some of the floating-point instructions have mixed GPR and FPR operands:
   // fine-tune the computed mapping.
   switch (Opc) {
+  case AArch64::G_DUP: {
+    Register ScalarReg = MI.getOperand(1).getReg();
+    auto ScalarDef = MRI.getVRegDef(ScalarReg);
+    if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
+        onlyDefinesFP(*ScalarDef, MRI, TRI))
+      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+    else
+      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
+    break;
+  }
   case TargetOpcode::G_TRUNC: {
     LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
     if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -306,8 +306,6 @@
                              unsigned OpFlags) const;
 
   // Optimization methods.
-  bool tryOptVectorShuffle(MachineInstr &I) const;
-  bool tryOptVectorDup(MachineInstr &MI) const;
  bool tryOptSelect(MachineInstr &MI) const;
   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                       MachineOperand &Predicate,
@@ -4211,119 +4209,8 @@
   return &*CmpMI;
 }
 
-bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
-  // Try to match a vector splat operation into a dup instruction.
-  // We're looking for this pattern:
-  //   %scalar:gpr(s64) = COPY $x0
-  //   %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
-  //   %cst0:gpr(s32) = G_CONSTANT i32 0
-  //   %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
-  //   %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
-  //   %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
-  //                                            %zerovec(<2 x s32>)
-  //
-  // ...into:
-  //   %splat = DUP %scalar
-  // We use the regbank of the scalar to determine which kind of dup to use.
-  MachineIRBuilder MIB(I);
-  MachineRegisterInfo &MRI = *MIB.getMRI();
-  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
-  using namespace TargetOpcode;
-  using namespace MIPatternMatch;
-
-  // Begin matching the insert.
-  auto *InsMI =
-      getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
-  if (!InsMI)
-    return false;
-  // Match the undef vector operand.
-  auto *UndefMI =
-      getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI);
-  if (!UndefMI)
-    return false;
-  // Match the scalar being splatted.
-  Register ScalarReg = InsMI->getOperand(2).getReg();
-  const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
-  // Match the index constant 0.
-  int64_t Index = 0;
-  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
-    return false;
-
-  // The shuffle's second operand doesn't matter if the mask is all zero.
-  ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
-  if (!all_of(Mask, [](int Elem) { return Elem == 0; }))
-    return false;
-
-  // We're done, now find out what kind of splat we need.
-  LLT VecTy = MRI.getType(I.getOperand(0).getReg());
-  LLT EltTy = VecTy.getElementType();
-  if (EltTy.getSizeInBits() < 32) {
-    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
-    return false;
-  }
-  bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
-  unsigned Opc = 0;
-  if (IsFP) {
-    switch (EltTy.getSizeInBits()) {
-    case 32:
-      if (VecTy.getNumElements() == 2) {
-        Opc = AArch64::DUPv2i32lane;
-      } else {
-        Opc = AArch64::DUPv4i32lane;
-        assert(VecTy.getNumElements() == 4);
-      }
-      break;
-    case 64:
-      assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
-      Opc = AArch64::DUPv2i64lane;
-      break;
-    }
-  } else {
-    switch (EltTy.getSizeInBits()) {
-    case 32:
-      if (VecTy.getNumElements() == 2) {
-        Opc = AArch64::DUPv2i32gpr;
-      } else {
-        Opc = AArch64::DUPv4i32gpr;
-        assert(VecTy.getNumElements() == 4);
-      }
-      break;
-    case 64:
-      assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
-      Opc = AArch64::DUPv2i64gpr;
-      break;
-    }
-  }
-  assert(Opc && "Did not compute an opcode for a dup");
-
-  // For FP splats, we need to widen the scalar reg via undef too.
-  if (IsFP) {
-    MachineInstr *Widen = emitScalarToVector(
-        EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
-    if (!Widen)
-      return false;
-    ScalarReg = Widen->getOperand(0).getReg();
-  }
-  auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
-  if (IsFP)
-    Dup.addImm(0);
-  constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
-  I.eraseFromParent();
-  return true;
-}
-
-bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
-  if (TM.getOptLevel() == CodeGenOpt::None)
-    return false;
-  if (tryOptVectorDup(I))
-    return true;
-  return false;
-}
-
 bool AArch64InstructionSelector::selectShuffleVector(
     MachineInstr &I, MachineRegisterInfo &MRI) const {
-  if (tryOptVectorShuffle(I))
-    return true;
   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
   Register Src1Reg = I.getOperand(1).getReg();
   const LLT Src1Ty = MRI.getType(Src1Reg);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -27,6 +28,7 @@
 #define DEBUG_TYPE "aarch64-postlegalizer-combiner"
 
 using namespace llvm;
+using namespace MIPatternMatch;
 
 /// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
 ///
@@ -41,6 +43,29 @@
   ShuffleVectorPseudo() {}
 };
 
+/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
+/// If \p MI is not a splat, returns None.
+static Optional<int> getSplatIndex(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+         "Only G_SHUFFLE_VECTOR can have a splat index!");
+  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+  auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });
+
+  // If all elements are undefined, this shuffle can be considered a splat.
+  // Return 0 for better potential for callers to simplify.
+  if (FirstDefinedIdx == Mask.end())
+    return 0;
+
+  // Make sure all remaining elements are either undef or the same
+  // as the first non-undef value.
+  int SplatValue = *FirstDefinedIdx;
+  if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
+             [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
+    return None;
+
+  return SplatValue;
+}
+
 /// Check if a vector shuffle corresponds to a REV instruction with the
 /// specified blocksize.
 static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
@@ -170,6 +195,53 @@
   return true;
 }
 
+static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
+                     ShuffleVectorPseudo &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+  auto Lane = getSplatIndex(MI);
+  if (!Lane || *Lane != 0)
+    return false;
+
+  // Try to match a vector splat operation into a dup instruction.
+  // We're looking for this pattern:
+  //
+  // %scalar:gpr(s64) = COPY $x0
+  // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
+  // %cst0:gpr(s32) = G_CONSTANT i32 0
+  // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
+  // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
+  // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>)
+  //
+  // ...into:
+  // %splat = G_DUP %scalar
+
+  // Begin matching the insert.
+  auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
+                             MI.getOperand(1).getReg(), MRI);
+  if (!InsMI)
+    return false;
+
+  // Match the undef vector operand.
+  if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
+                    InsMI->getOperand(1).getReg(), MRI))
+    return false;
+
+  // Match the index constant 0.
+  int64_t Index = 0;
+  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
+    return false;
+
+  Register Dst = MI.getOperand(0).getReg();
+  if (MRI.getType(Dst).getScalarSizeInBits() < 32) {
+    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
+    return false;
+  }
+
+  MatchInfo =
+      ShuffleVectorPseudo(AArch64::G_DUP, Dst, {InsMI->getOperand(2).getReg()});
+  return true;
+}
+
 /// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
 /// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
 static bool applyShuffleVectorPseudo(MachineInstr &MI,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-shuffle-splat.mir
rename from llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir
rename to llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-shuffle-splat.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-shuffle-splat.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -O1 -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
 ---
 name: splat_4xi32
 alignment: 4
@@ -12,9 +13,9 @@
 
     ; CHECK-LABEL: name: splat_4xi32
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
-    ; CHECK: [[DUPv4i32gpr:%[0-9]+]]:fpr128 = DUPv4i32gpr [[COPY]]
-    ; CHECK: $q0 = COPY [[DUPv4i32gpr]]
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
     ; CHECK: RET_ReallyLR implicit $q0
     %0:gpr(s32) = COPY $w0
     %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
@@ -37,9 +38,9 @@
 
     ; CHECK-LABEL: name: splat_2xi64
     ; CHECK: liveins: $x0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
-    ; CHECK: [[DUPv2i64gpr:%[0-9]+]]:fpr128 = DUPv2i64gpr [[COPY]]
-    ; CHECK: $q0 = COPY [[DUPv2i64gpr]]
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
     ; CHECK: RET_ReallyLR implicit $q0
     %0:gpr(s64) = COPY $x0
     %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
@@ -62,9 +63,9 @@
 
     ; CHECK-LABEL: name: splat_2xi32
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
-    ; CHECK: [[DUPv2i32gpr:%[0-9]+]]:fpr64 = DUPv2i32gpr [[COPY]]
-    ; CHECK: $d0 = COPY [[DUPv2i32gpr]]
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
     ; CHECK: RET_ReallyLR implicit $d0
     %0:gpr(s32) = COPY $w0
     %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
@@ -87,11 +88,9 @@
 
     ; CHECK-LABEL: name: splat_4xf32
     ; CHECK: liveins: $s0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr32 = COPY $s0
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub
-    ; CHECK: [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[INSERT_SUBREG]], 0
-    ; CHECK: $q0 = COPY [[DUPv4i32lane]]
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
     ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(s32) = COPY $s0
     %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
@@ -114,11 +113,9 @@
 
     ; CHECK-LABEL: name: splat_2xf64
     ; CHECK: liveins: $d0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
-    ; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[INSERT_SUBREG]], 0
-    ; CHECK: $q0 = COPY [[DUPv2i64lane]]
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
     ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(s64) = COPY $d0
     %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
@@ -141,11 +138,9 @@
 
     ; CHECK-LABEL: name: splat_2xf32
     ; CHECK: liveins: $s0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr32 = COPY $s0
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub
-    ; CHECK: [[DUPv2i32lane:%[0-9]+]]:fpr64 = DUPv2i32lane [[INSERT_SUBREG]], 0
-    ; CHECK: $d0 = COPY [[DUPv2i32lane]]
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
     ; CHECK: RET_ReallyLR implicit $d0
     %0:fpr(s32) = COPY $s0
     %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
@@ -170,11 +165,9 @@
     ; These copies shouldn't get in the way of matching the dup pattern.
     ; CHECK-LABEL: name: splat_2xf64_copies
     ; CHECK: liveins: $d0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
-    ; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[INSERT_SUBREG]], 0
-    ; CHECK: $q0 = COPY [[DUPv2i64lane]]
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
     ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(s64) = COPY $d0
     %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
@@ -199,14 +192,12 @@
     ; Make sure that we don't do the optimization when it's not all zeroes.
     ; CHECK-LABEL: name: not_all_zeros
     ; CHECK: liveins: $x0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSvi64gpr:%[0-9]+]]:fpr128 = INSvi64gpr [[DEF]], 0, [[COPY]]
-    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
-    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[INSvi64gpr]], %subreg.qsub0, [[DEF]], %subreg.qsub1
-    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
-    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+    ; CHECK: [[DEF:%[0-9]+]]:fpr(<2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[IVEC:%[0-9]+]]:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s32)
+    ; CHECK: [[SHUF:%[0-9]+]]:fpr(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 1)
+    ; CHECK: $q0 = COPY [[SHUF]](<2 x s64>)
     ; CHECK: RET_ReallyLR implicit $q0
     %0:gpr(s64) = COPY $x0
     %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
@@ -215,3 +206,89 @@
     %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 1)
     $q0 = COPY %4(<2 x s64>)
     RET_ReallyLR implicit $q0
+
+...
+---
+name: all_undef
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $x0
+
+    ; If all the elements are undefined, we consider it a splat. In this case,
+    ; we can choose 0 as our index.
+    ;
+    ; We should get a G_DUP here.
+    ;
+    ; CHECK-LABEL: name: all_undef
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:gpr(s64) = COPY $x0
+    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(-1, -1)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: one_undef
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $s0
+
+    ; Make sure we can skip past undef values.
+    ;
+    ; We should get a G_DUP here.
+    ;
+    ; CHECK-LABEL: name: one_undef
+    ; CHECK: liveins: $s0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(s32) = COPY $s0
+    %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, -1, 0, 0)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: not_all_zeros_with_undefs
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $s0
+
+    ; Check a non-splat mask with an undef value. We shouldn't get a G_DUP here.
+    ;
+    ; CHECK-LABEL: name: not_all_zeros_with_undefs
+    ; CHECK: liveins: $s0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK: [[DEF:%[0-9]+]]:fpr(<4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[IVEC:%[0-9]+]]:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+    ; CHECK: [[SHUF:%[0-9]+]]:fpr(<4 x s32>) = G_SHUFFLE_VECTOR [[IVEC]](<4 x s32>), [[DEF]], shufflemask(undef, 0, 0, 3)
+    ; CHECK: $q0 = COPY [[SHUF]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(s32) = COPY $s0
+    %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(-1, 0, 0, 3)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
@@ -0,0 +1,154 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=regbankselect -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Verify register banks for G_DUP.
+#
+
+...
+---
+name: v4s32_gpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: v4s32_gpr
+    ; CHECK: liveins: $w0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(s32) = COPY $w0
+    %4:_(<4 x s32>) = G_DUP %0(s32)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: v4s64_gpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: v4s64_gpr
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(s64) = COPY $x0
+    %4:_(<2 x s64>) = G_DUP %0(s64)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: v2s32_gpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: v2s32_gpr
+    ; CHECK: liveins: $w0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:_(s32) = COPY $w0
+    %4:_(<2 x s32>) = G_DUP %0(s32)
+    $d0 = COPY %4(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name: v4s32_fpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: v4s32_fpr
+    ; CHECK: liveins: $s0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(s32) = COPY $s0
+    %4:_(<4 x s32>) = G_DUP %0(s32)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: v2s64_fpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: v2s64_fpr
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(s64) = COPY $d0
+    %4:_(<2 x s64>) = G_DUP %0(s64)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: v2s32_fpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: v2s32_fpr
+    ; CHECK: liveins: $s0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:_(s32) = COPY $s0
+    %4:_(<2 x s32>) = G_DUP %0(s32)
+    $d0 = COPY %4(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name: v2s64_fpr_copy
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: v2s64_fpr_copy
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(s64) = COPY $d0
+    %6:_(<2 x s64>) = G_DUP %0(s64)
+    $q0 = COPY %6(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
rename from llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir
rename to llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
@@ -1,5 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -O1 -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+
+...
 ---
 name: splat_4xi32
 alignment: 4
@@ -7,7 +9,7 @@
 regBankSelected: true
 tracksRegLiveness: true
 body: |
-  bb.1.entry:
+  bb.0.entry:
     liveins: $w0
 
     ; CHECK-LABEL: name: splat_4xi32
@@ -17,10 +19,7 @@
     ; CHECK: $q0 = COPY [[DUPv4i32gpr]]
     ; CHECK: RET_ReallyLR implicit $q0
     %0:gpr(s32) = COPY $w0
-    %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
-    %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
+    %4:fpr(<4 x s32>) = G_DUP %0(s32)
     $q0 = COPY %4(<4 x s32>)
     RET_ReallyLR implicit $q0
 
@@ -32,7 +31,7 @@
 regBankSelected: true
 tracksRegLiveness: true
 body: |
-  bb.1.entry:
+  bb.0.entry:
     liveins: $x0
 
     ; CHECK-LABEL: name: splat_2xi64
@@ -42,10 +41,7 @@
     ; CHECK: $q0 = COPY [[DUPv2i64gpr]]
     ; CHECK: RET_ReallyLR implicit $q0
     %0:gpr(s64) = COPY $x0
-    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
-    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 0)
+    %4:fpr(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %4(<2 x s64>)
     RET_ReallyLR implicit $q0
 
@@ -57,7 +53,7 @@
 regBankSelected: true
 tracksRegLiveness: true
 body: |
-  bb.1.entry:
+  bb.0.entry:
     liveins: $w0
 
     ; CHECK-LABEL: name: splat_2xi32
@@ -67,10 +63,7 @@
     ; CHECK: $d0 = COPY [[DUPv2i32gpr]]
     ; CHECK: RET_ReallyLR implicit $d0
     %0:gpr(s32) = COPY $w0
-    %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
-    %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0)
+    %4:fpr(<2 x s32>) = G_DUP %0(s32)
     $d0 = COPY %4(<2 x s32>)
     RET_ReallyLR implicit $d0
 
@@ -82,7 +75,7 @@
 regBankSelected: true
 tracksRegLiveness: true
 body: |
-  bb.1.entry:
+  bb.0.entry:
     liveins: $s0
 
     ; CHECK-LABEL: name: splat_4xf32
@@ -94,10 +87,7 @@
     ; CHECK: $q0 = COPY [[DUPv4i32lane]]
     ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(s32) = COPY $s0
-    %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
-    %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
+    %4:fpr(<4 x s32>) = G_DUP %0(s32)
     $q0 = COPY %4(<4 x s32>)
     RET_ReallyLR implicit $q0
 
@@ -109,7 +99,7 @@
 regBankSelected: true
 tracksRegLiveness: true
 body: |
-  bb.1.entry:
+  bb.0.entry:
     liveins: $d0
 
     ; CHECK-LABEL: name: splat_2xf64
@@ -121,10 +111,7 @@
     ; CHECK: $q0 = COPY [[DUPv2i64lane]]
     ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(s64) = COPY $d0
-    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
-    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 0)
+    %4:fpr(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %4(<2 x s64>)
     RET_ReallyLR implicit $q0
 
@@ -136,7 +123,7 @@
 regBankSelected: true
 tracksRegLiveness: true
 body: |
-  bb.1.entry:
+  bb.0.entry:
     liveins: $s0
 
     ; CHECK-LABEL: name: splat_2xf32
@@ -148,10 +135,7 @@
     ; CHECK: $d0 = COPY [[DUPv2i32lane]]
     ; CHECK: RET_ReallyLR implicit $d0
     %0:fpr(s32) = COPY $s0
-    %2:fpr(<2 x s32>) = G_IMPLICIT_DEF
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
-    %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0)
+    %4:fpr(<2 x s32>) = G_DUP %0(s32)
     $d0 = COPY %4(<2 x s32>)
     RET_ReallyLR implicit $d0
 
@@ -163,11 +147,9 @@
 regBankSelected: true
 tracksRegLiveness: true
 body: |
-  bb.1.entry:
+  bb.0.entry:
     liveins: $d0
 
-    ; This test is exactly the same as splat_2xf64, except it adds two copies.
-    ; These copies shouldn't get in the way of matching the dup pattern.
     ; CHECK-LABEL: name: splat_2xf64_copies
     ; CHECK: liveins: $d0
     ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
@@ -177,41 +159,6 @@
     ; CHECK: $q0 = COPY [[DUPv2i64lane]]
     ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(s64) = COPY $d0
-    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
-    %6:fpr(<2 x s64>) = COPY %2
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %6, %0(s64), %3(s32)
-    %7:fpr(<2 x s64>) = COPY %1
-    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %7(<2 x s64>), %2, shufflemask(0, 0)
-    $q0 = COPY %4(<2 x s64>)
-    RET_ReallyLR implicit $q0
-
-...
----
-name: not_all_zeros
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
-  bb.1.entry:
-    liveins: $x0
-    ; Make sure that we don't do the optimization when it's not all zeroes.
-    ; CHECK-LABEL: name: not_all_zeros
-    ; CHECK: liveins: $x0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSvi64gpr:%[0-9]+]]:fpr128 = INSvi64gpr [[DEF]], 0, [[COPY]]
-    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
-    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[INSvi64gpr]], %subreg.qsub0, [[DEF]], %subreg.qsub1
-    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
-    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
-    ; CHECK: RET_ReallyLR implicit $q0
-    %0:gpr(s64) = COPY $x0
-    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
-    %3:gpr(s32) = G_CONSTANT i32 0
-    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
-    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 1)
-    $q0 = COPY %4(<2 x s64>)
+    %6:fpr(<2 x s64>) = G_DUP %0(s64)
+    $q0 = COPY %6(<2 x s64>)
     RET_ReallyLR implicit $q0
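
Note (not part of the patch): the splat detection that drives matchDup() reduces to a small scan over the shuffle mask. The sketch below mirrors the logic of getSplatIndex() using standard-library stand-ins (std::vector for ArrayRef, std::optional for Optional); the masks exercised in main() are taken from the MIR tests above. It is an illustration only, not LLVM code.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <optional>
#include <vector>

// Returns the splat index of a shuffle mask, or std::nullopt if the mask is
// not a splat. Undef elements are encoded as -1, as in G_SHUFFLE_VECTOR masks.
static std::optional<int> splatIndex(const std::vector<int> &Mask) {
  auto FirstDefined =
      std::find_if(Mask.begin(), Mask.end(), [](int Elt) { return Elt >= 0; });
  // An all-undef mask counts as a splat; pick lane 0 so callers can simplify.
  if (FirstDefined == Mask.end())
    return 0;
  int Splat = *FirstDefined;
  // Every remaining element must be undef or equal to the first defined one.
  for (auto It = std::next(FirstDefined); It != Mask.end(); ++It)
    if (*It >= 0 && *It != Splat)
      return std::nullopt;
  return Splat;
}

int main() {
  // Masks taken from the tests in this patch; matchDup() additionally
  // requires the resulting lane to be 0 before forming a G_DUP.
  assert(splatIndex({0, 0, 0, 0}) == 0);  // splat_4xi32
  assert(splatIndex({-1, -1}) == 0);      // all_undef
  assert(splatIndex({0, -1, 0, 0}) == 0); // one_undef
  assert(!splatIndex({-1, 0, 0, 3}));     // not_all_zeros_with_undefs
  assert(!splatIndex({0, 1}));            // not_all_zeros
  std::puts("all mask checks passed");
  return 0;
}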