Index: llvm/lib/Target/AArch64/AArch64Combine.td =================================================================== --- llvm/lib/Target/AArch64/AArch64Combine.td +++ llvm/lib/Target/AArch64/AArch64Combine.td @@ -135,13 +135,22 @@ (apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }]) >; +def build_vector_to_dup : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_BUILD_VECTOR):$root, + [{ return matchBuildVectorToDup(*${root}, MRI); }]), + (apply [{ return applyBuildVectorToDup(*${root}, MRI, B); }]) +>; + +def build_vector_lowering : GICombineGroup<[build_vector_to_dup]>; + // Post-legalization combines which should happen at all optimization levels. // (E.g. ones that facilitate matching for the selector) For example, matching // pseudos. def AArch64PostLegalizerLoweringHelper : GICombinerHelper<"AArch64GenPostLegalizerLoweringHelper", [shuffle_vector_lowering, vashr_vlshr_imm, - icmp_lowering]> { + icmp_lowering, build_vector_lowering]> { let DisableRuleOption = "aarch64postlegalizerlowering-disable-rule"; } Index: llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -34,6 +34,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -145,6 +146,16 @@ Register EltReg, unsigned LaneIdx, const RegisterBank &RB, MachineIRBuilder &MIRBuilder) const; + + /// Emit a sequence of instructions representing a constant \p CV for a + /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.) + /// + /// \returns the last instruction in the sequence on success, and nullptr + /// otherwise. + MachineInstr *emitConstantVector(Register Dst, Constant *CV, + MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI) const; + bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const; bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy, MachineRegisterInfo &MRI) const; @@ -1659,6 +1670,16 @@ assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); MachineInstr *OpMI = MRI.getVRegDef(Reg); assert(OpMI && "Expected to find a vreg def for vector shift operand"); + + // We may combine these to G_DUP. + if (OpMI->getOpcode() == AArch64::G_DUP) { + auto VRegAndVal = + getConstantVRegValWithLookThrough(OpMI->getOperand(1).getReg(), MRI); + if (!VRegAndVal) + return None; + return VRegAndVal->Value.getSExtValue(); + } + if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR) return None; @@ -2125,6 +2146,25 @@ MachineRegisterInfo &MRI = MF.getRegInfo(); switch (I.getOpcode()) { + case AArch64::G_DUP: { + // Before selecting a DUP instruction, check if it is better selected as a + // MOV or load from a constant pool. 
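+    // A constant splat can usually be materialized more cheaply: a splat of
+    // zero is a single MOVI, and other constants can be loaded from the
+    // constant pool instead of being moved into a GPR and duplicated.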
+ Register Src = I.getOperand(1).getReg(); + auto ValAndVReg = getConstantVRegValWithLookThrough(Src, MRI); + if (!ValAndVReg) + return false; + LLVMContext &Ctx = MF.getFunction().getContext(); + Register Dst = I.getOperand(0).getReg(); + auto *CV = ConstantDataVector::getSplat( + MRI.getType(Dst).getNumElements(), + ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()), + ValAndVReg->Value)); + MachineIRBuilder MIRBuilder(I); + if (!emitConstantVector(Dst, CV, MIRBuilder, MRI)) + return false; + I.eraseFromParent(); + return true; + } case TargetOpcode::G_BR: { // If the branch jumps to the fallthrough block, don't bother emitting it. // Only do this for -O0 for a good code size improvement, because when @@ -4811,6 +4851,43 @@ return true; } +MachineInstr* AArch64InstructionSelector::emitConstantVector( + Register Dst, Constant *CV, MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI) const { + LLT DstTy = MRI.getType(Dst); + unsigned DstSize = DstTy.getSizeInBits(); + if (CV->isNullValue()) { + if (DstSize == 128) { + auto Mov = + MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); + constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); + return &*Mov; + } + + if (DstSize == 64) { + auto Mov = + MIRBuilder + .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) + .addImm(0); + auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) + .addReg(Mov.getReg(0), 0, AArch64::dsub); + RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); + return &*Copy; + } + } + + auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); + if (!CPLoad) { + LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); + return nullptr; + } + + auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); + RBI.constrainGenericRegister( + Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); + return &*Copy; +} + bool AArch64InstructionSelector::tryOptConstantBuildVec( MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); @@ -4837,33 +4914,8 @@ } Constant *CV = ConstantVector::get(Csts); MachineIRBuilder MIB(I); - if (CV->isNullValue()) { - // Until the importer can support immAllZerosV in pattern leaf nodes, - // select a zero move manually here. 
- Register DstReg = I.getOperand(0).getReg(); - if (DstSize == 128) { - auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); - } else if (DstSize == 64) { - auto Mov = - MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) - .addImm(0); - MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) - .addReg(Mov.getReg(0), 0, AArch64::dsub); - I.eraseFromParent(); - return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI); - } - } - auto *CPLoad = emitLoadFromConstantPool(CV, MIB); - if (!CPLoad) { - LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector"); + if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) return false; - } - MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0)); - RBI.constrainGenericRegister(I.getOperand(0).getReg(), - *MRI.getRegClass(CPLoad->getOperand(0).getReg()), - MRI); I.eraseFromParent(); return true; } Index: llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -471,10 +471,16 @@ int64_t &Cnt) { assert(Ty.isVector() && "vector shift count is not a vector type"); MachineInstr *MI = MRI.getVRegDef(Reg); - auto Cst = getBuildVectorConstantSplat(*MI, MRI); - if (!Cst) + if (MI->getOpcode() == AArch64::G_DUP) { + auto VRegAndVal = + getConstantVRegValWithLookThrough(MI->getOperand(1).getReg(), MRI); + if (!VRegAndVal) + return false; + Cnt = VRegAndVal->Value.getSExtValue(); + } else if (auto Cst = getBuildVectorConstantSplat(*MI, MRI)) { + Cnt = *Cst; + } else return false; - Cnt = *Cst; int64_t ElementBits = Ty.getScalarSizeInBits(); return Cnt >= 1 && Cnt <= ElementBits; } @@ -696,6 +702,23 @@ return true; } +static bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + auto Reg = MI.getOperand(1).getReg(); + return all_of( + make_range(MI.operands_begin() + 2, MI.operands_end()), + [&Reg](const MachineOperand &Op) { return Op.getReg() == Reg; }); +} + +static bool applyBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(AArch64::G_DUP, {MI.getOperand(0).getReg()}, + {MI.getOperand(1).getReg()}); + MI.eraseFromParent(); + return true; +} + #define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS #include "AArch64GenPostLegalizeGILowering.inc" #undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS Index: llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-lowering -verify-machineinstrs %s -o - | FileCheck %s + +... 
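+# The lowering should only fire when every source operand of the
+# G_BUILD_VECTOR is the same vreg; such a splat is rewritten to a single
+# G_DUP. If the operands differ, the G_BUILD_VECTOR must be left untouched.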
+--- +name: same_reg +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0 + ; CHECK-LABEL: name: same_reg + ; CHECK: liveins: $d0 + ; CHECK: %r:_(s8) = G_IMPLICIT_DEF + ; CHECK: %build_vector:_(<8 x s8>) = G_DUP %r(s8) + ; CHECK: $d0 = COPY %build_vector(<8 x s8>) + ; CHECK: RET_ReallyLR implicit $d0 + %r:_(s8) = G_IMPLICIT_DEF + %build_vector:_(<8 x s8>) = G_BUILD_VECTOR %r, %r, %r, %r, %r, %r, %r, %r + $d0 = COPY %build_vector(<8 x s8>) + RET_ReallyLR implicit $d0 + +... +--- +name: dont_combine_different_reg +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0, $w0, $w1 + ; CHECK-LABEL: name: dont_combine_different_reg + ; CHECK: liveins: $d0, $w0, $w1 + ; CHECK: %r:_(s32) = COPY $w0 + ; CHECK: %q:_(s32) = COPY $w1 + ; CHECK: %build_vector:_(<2 x s32>) = G_BUILD_VECTOR %r(s32), %q(s32) + ; CHECK: $d0 = COPY %build_vector(<2 x s32>) + ; CHECK: RET_ReallyLR implicit $d0 + %r:_(s32) = COPY $w0 + %q:_(s32) = COPY $w1 + %build_vector:_(<2 x s32>) = G_BUILD_VECTOR %r, %q + $d0 = COPY %build_vector(<2 x s32>) + RET_ReallyLR implicit $d0 Index: llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-vashr-vlshr.mir =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-vashr-vlshr.mir +++ llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-vashr-vlshr.mir @@ -84,8 +84,8 @@ ; CHECK: liveins: $d0, $d1 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 40 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) - ; CHECK: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[DUP:%[0-9]+]]:_(<4 x s32>) = G_DUP [[C]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[COPY]], [[DUP]](<4 x s32>) ; CHECK: $q0 = COPY [[LSHR]](<4 x s32>) ; CHECK: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 @@ -108,8 +108,8 @@ ; CHECK: liveins: $d0, $d1 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) - ; CHECK: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[DUP:%[0-9]+]]:_(<4 x s32>) = G_DUP [[C]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[COPY]], [[DUP]](<4 x s32>) ; CHECK: $q0 = COPY [[LSHR]](<4 x s32>) ; CHECK: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 Index: llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir +++ llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir @@ -379,3 +379,60 @@ RET_ReallyLR implicit $q0 ... +--- +name: zero_v4s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: zero_v4s32 + ; CHECK: liveins: $w0 + ; CHECK: %dup:fpr128 = MOVIv2d_ns 0 + ; CHECK: $q0 = COPY %dup + ; CHECK: RET_ReallyLR implicit $q0 + %zero:gpr(s32) = G_CONSTANT i32 0 + %dup:fpr(<4 x s32>) = G_DUP %zero(s32) + $q0 = COPY %dup(<4 x s32>) + RET_ReallyLR implicit $q0 + +... 
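+# A zero splat with a 64-bit destination is still materialized as a 128-bit
+# MOVI; the result is then copied out of the D subregister rather than being
+# loaded from the constant pool.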
+--- +name: zero_v8s8 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: zero_v8s8 + ; CHECK: liveins: $w0 + ; CHECK: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0 + ; CHECK: %dup:fpr64 = COPY [[MOVIv2d_ns]].dsub + ; CHECK: $d0 = COPY %dup + ; CHECK: RET_ReallyLR implicit $d0 + %zero:gpr(s8) = G_CONSTANT i8 0 + %dup:fpr(<8 x s8>) = G_DUP %zero(s8) + $d0 = COPY %dup(<8 x s8>) + RET_ReallyLR implicit $d0 + +... +--- +name: one_v4s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: one_v4s32 + ; CHECK: liveins: $w0 + ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0 + ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0 + ; CHECK: $q0 = COPY [[LDRQui]] + ; CHECK: RET_ReallyLR implicit $q0 + %zero:gpr(s32) = G_CONSTANT i32 1 + %dup:fpr(<4 x s32>) = G_DUP %zero(s32) + $q0 = COPY %dup(<4 x s32>) + RET_ReallyLR implicit $q0 Index: llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir +++ llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir @@ -572,3 +572,38 @@ $q0 = COPY %2(<16 x s8>) RET_ReallyLR implicit $q0 ... +--- +name: shl_v2i32_imm_dup +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: gpr } + - { id: 3, class: fpr } +liveins: + - { reg: '$d0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.1: + liveins: $d0 + + ; Should still be able to select immediate forms using a G_DUP from a + ; constant. + + ; CHECK-LABEL: name: shl_v2i32_imm_dup + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK: [[SHLv2i32_shift:%[0-9]+]]:fpr64 = SHLv2i32_shift [[COPY]], 24 + ; CHECK: $d0 = COPY [[SHLv2i32_shift]] + ; CHECK: RET_ReallyLR implicit $d0 + %0:fpr(<2 x s32>) = COPY $d0 + %2:gpr(s32) = G_CONSTANT i32 24 + %1:fpr(<2 x s32>) = G_DUP %2(s32) + %3:fpr(<2 x s32>) = G_SHL %0, %1(<2 x s32>) + $d0 = COPY %3(<2 x s32>) + RET_ReallyLR implicit $d0