diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
@@ -143,6 +144,21 @@
   void materializeLargeCMVal(MachineInstr &I, const Value *V,
                              unsigned char OpFlags) const;
 
+  // Optimization methods.
+
+  // Helper function to check whether a register is defined by an MI with the
+  // given opcode; returns that MI if so, nullptr otherwise.
+  MachineInstr *findMIFromReg(unsigned Reg, unsigned Opc,
+                              MachineIRBuilder &MIB) const {
+    auto *Def = MIB.getMRI()->getVRegDef(Reg);
+    if (!Def || Def->getOpcode() != Opc)
+      return nullptr;
+    return Def;
+  }
+
+  bool tryOptVectorShuffle(MachineInstr &I) const;
+  bool tryOptVectorDup(MachineInstr &MI) const;
+
   const AArch64TargetMachine &TM;
   const AArch64Subtarget &STI;
   const AArch64InstrInfo &TII;
@@ -2325,8 +2341,93 @@
   return &*InsElt;
 }
 
+bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
+  // Try to match a vector splat operation into a dup instruction.
+  // We're looking for this pattern:
+  //   %scalar:gpr(s64) = COPY $x0
+  //   %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
+  //   %cst0:gpr(s32) = G_CONSTANT i32 0
+  //   %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
+  //   %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
+  //   %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
+  //                                            %zerovec(<2 x s32>)
+  //
+  // We use the regbank of the scalar to determine which kind of dup to use.
+  MachineIRBuilder MIB(I);
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+  using namespace TargetOpcode;
+  using namespace MIPatternMatch;
+
+  // Begin matching the insert.
+  auto *InsMI =
+      findMIFromReg(I.getOperand(1).getReg(), G_INSERT_VECTOR_ELT, MIB);
+  if (!InsMI)
+    return false;
+  // Match the undef vector operand.
+  auto *UndefMI =
+      findMIFromReg(InsMI->getOperand(1).getReg(), G_IMPLICIT_DEF, MIB);
+  if (!UndefMI)
+    return false;
+  // Match the scalar being splatted.
+  unsigned ScalarReg = InsMI->getOperand(2).getReg();
+  const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
+  // Match the index constant 0.
+  int64_t Index = 0;
+  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
+    return false;
+
+  // The shuffle's second operand doesn't matter if the mask is all zero.
+  auto *ZeroVec = findMIFromReg(I.getOperand(3).getReg(), G_BUILD_VECTOR, MIB);
+  if (!ZeroVec)
+    return false;
+  int64_t Zero = 0;
+  if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero)
+    return false;
+  for (unsigned i = 1; i < ZeroVec->getNumOperands() - 1; ++i) {
+    if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg())
+      return false; // This wasn't an all zeros vector.
+  }
+
+  // We're done matching; now figure out what kind of splat we need.
+  LLT VecTy = MRI.getType(I.getOperand(0).getReg());
+  LLT EltTy = VecTy.getElementType();
+  if (VecTy.getSizeInBits() != 128 || EltTy.getSizeInBits() < 32)
+    return false;
+  bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
+  static const unsigned OpcTable[2][2] = {
+      {AArch64::DUPv4i32gpr, AArch64::DUPv2i64gpr},
+      {AArch64::DUPv4i32lane, AArch64::DUPv2i64lane}};
+  unsigned Opc = OpcTable[IsFP][EltTy.getSizeInBits() == 64];
+
+  // For FP splats, we need to widen the scalar reg via undef too.
+  if (IsFP) {
+    MachineInstr *Widen = emitScalarToVector(
+        EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
+    if (!Widen)
+      return false;
+    ScalarReg = Widen->getOperand(0).getReg();
+  }
+  auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
+  if (IsFP)
+    Dup.addImm(0);
+  constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
+  I.eraseFromParent();
+  return true;
+}
+
+bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
+  if (TM.getOptLevel() == CodeGenOpt::None)
+    return false;
+  if (tryOptVectorDup(I))
+    return true;
+  return false;
+}
+
 bool AArch64InstructionSelector::selectShuffleVector(
     MachineInstr &I, MachineRegisterInfo &MRI) const {
+  if (tryOptVectorShuffle(I))
+    return true;
   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
   unsigned Src1Reg = I.getOperand(1).getReg();
   const LLT Src1Ty = MRI.getType(Src1Reg);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir
@@ -0,0 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -O1 -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: splat_4xi32
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: splat_4xi32
+    ; CHECK: liveins: $w0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
+    ; CHECK: [[DUPv4i32gpr:%[0-9]+]]:fpr128 = DUPv4i32gpr [[COPY]]
+    ; CHECK: $q0 = COPY [[DUPv4i32gpr]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:gpr(s32) = COPY $w0
+    %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %5:fpr(<4 x s32>) = G_BUILD_VECTOR %3(s32), %3(s32), %3(s32), %3(s32)
+    %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, %5(<4 x s32>)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: splat_2xi64
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: splat_2xi64
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[DUPv2i64gpr:%[0-9]+]]:fpr128 = DUPv2i64gpr [[COPY]]
+    ; CHECK: $q0 = COPY [[DUPv2i64gpr]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:gpr(s64) = COPY $x0
+    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %5:fpr(<2 x s32>) = G_BUILD_VECTOR %3(s32), %3(s32)
+    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, %5(<2 x s32>)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: splat_4xf32
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: splat_4xf32
+    ; CHECK: liveins: $s0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr32 = COPY $s0
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub
+    ; CHECK: [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[INSERT_SUBREG]], 0
+    ; CHECK: $q0 = COPY [[DUPv4i32lane]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(s32) = COPY $s0
+    %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %5:fpr(<4 x s32>) = G_BUILD_VECTOR %3(s32), %3(s32), %3(s32), %3(s32)
+    %1:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    %4:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, %5(<4 x s32>)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: splat_2xf64
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: splat_2xf64
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
+    ; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[INSERT_SUBREG]], 0
+    ; CHECK: $q0 = COPY [[DUPv2i64lane]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(s64) = COPY $d0
+    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %5:fpr(<2 x s32>) = G_BUILD_VECTOR %3(s32), %3(s32)
+    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, %5(<2 x s32>)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
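As an illustration of what this patch targets (not part of the patch itself; the function name below is made up), the MIR in the new tests corresponds to the canonical IR-level splat idiom: an insertelement into undef at index 0 followed by a shufflevector with a zeroinitializer mask. With this change, GlobalISel should select that sequence to a single DUP (for the first test, DUPv4i32gpr, i.e. "dup v0.4s, w0") instead of falling through to the generic shuffle lowering:

    define <4 x i32> @splat_i32(i32 %x) {
    entry:
      ; Insert the scalar into lane 0 of an undef vector...
      %ins = insertelement <4 x i32> undef, i32 %x, i32 0
      ; ...then broadcast lane 0 to every element with an all-zero mask.
      %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
      ret <4 x i32> %splat
    }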