diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -83,6 +83,15 @@
   (apply [{ applyVAshrLshrImm(*${root}, MRI, ${matchinfo}); }])
 >;
 
+def form_duplane_matchdata :
+  GIDefMatchData<"std::pair<unsigned, int>">;
+def form_duplane : GICombineRule <
+  (defs root:$root, form_duplane_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+          [{ return matchDupLane(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
+>;
+
 def adjust_icmp_imm_matchdata :
   GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">;
 def adjust_icmp_imm : GICombineRule <
@@ -108,7 +117,7 @@
 def AArch64PostLegalizerLoweringHelper
     : GICombinerHelper<"AArch64GenPostLegalizerLoweringHelper",
                        [shuffle_vector_pseudos, vashr_vlshr_imm,
-                        icmp_lowering]> {
+                        icmp_lowering, form_duplane]> {
   let DisableRuleOption = "aarch64postlegalizerlowering-disable-rule";
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -88,6 +88,29 @@
   let InOperandList = (ins type1:$lane);
   let hasSideEffects = 0;
 }
+
+// Represents a lane duplicate operation.
+def G_DUPLANE8 : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, type1:$lane);
+  let hasSideEffects = 0;
+}
+def G_DUPLANE16 : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, type1:$lane);
+  let hasSideEffects = 0;
+}
+def G_DUPLANE32 : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, type1:$lane);
+  let hasSideEffects = 0;
+}
+def G_DUPLANE64 : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, type1:$lane);
+  let hasSideEffects = 0;
+}
+
 // Represents a trn1 instruction. Produced post-legalization from
 // G_SHUFFLE_VECTORs with appropriate masks.
 def G_TRN1 : AArch64GenericInstruction {
@@ -131,6 +154,10 @@
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
+def : GINodeEquiv<G_DUPLANE8, AArch64duplane8>;
+def : GINodeEquiv<G_DUPLANE16, AArch64duplane16>;
+def : GINodeEquiv<G_DUPLANE32, AArch64duplane32>;
+def : GINodeEquiv<G_DUPLANE64, AArch64duplane64>;
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -136,8 +136,6 @@
   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool tryOptShuffleDupLane(MachineInstr &I, LLT DstTy, LLT SrcTy,
-                            ArrayRef<int> Mask, MachineRegisterInfo &MRI) const;
   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -4319,67 +4317,6 @@
   return nullptr;
 }
 
-bool AArch64InstructionSelector::tryOptShuffleDupLane(
-    MachineInstr &I, LLT DstTy, LLT SrcTy, ArrayRef<int> Mask,
-    MachineRegisterInfo &MRI) const {
-  assert(I.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
-
-  // We assume that scalar->vector splats have been been handled in the
-  // post-legalizer combiner to G_DUP. However splats of a source vector's
-  // lane don't fit that pattern, detect it here:
-  //   %res = G_SHUFFLE_VECTOR %src:<n x ty>, undef, splat(lane-idx)
-  //     =>
-  //   %res = DUPv[N][Ty]lane %src, lane-idx
-  // FIXME: this case should be covered by re-implementing the perfect shuffle
-  // codegen mechanism.
-
-  auto LaneIdx = getSplatIndex(I);
-  if (!LaneIdx)
-    return false;
-
-  // The lane idx should be within the first source vector.
-  if (*LaneIdx >= SrcTy.getNumElements())
-    return false;
-
-  if (DstTy != SrcTy)
-    return false;
-
-  LLT ScalarTy = SrcTy.getElementType();
-  unsigned ScalarSize = ScalarTy.getSizeInBits();
-
-  unsigned Opc = 0;
-  switch (SrcTy.getNumElements()) {
-  case 2:
-    if (ScalarSize == 64)
-      Opc = AArch64::DUPv2i64lane;
-    break;
-  case 4:
-    if (ScalarSize == 32)
-      Opc = AArch64::DUPv4i32lane;
-    break;
-  case 8:
-    if (ScalarSize == 16)
-      Opc = AArch64::DUPv8i16lane;
-    break;
-  case 16:
-    if (ScalarSize == 8)
-      Opc = AArch64::DUPv16i8lane;
-    break;
-  default:
-    break;
-  }
-  if (!Opc)
-    return false;
-
-  MachineIRBuilder MIB(I);
-  auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()},
-                            {I.getOperand(1).getReg()})
-                 .addImm(*LaneIdx);
-  constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
-  I.eraseFromParent();
-  return true;
-}
-
 bool AArch64InstructionSelector::selectShuffleVector(
     MachineInstr &I, MachineRegisterInfo &MRI) const {
   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -4401,9 +4338,6 @@
     return false;
   }
 
-  if (tryOptShuffleDupLane(I, DstTy, Src1Ty, Mask, MRI))
-    return true;
-
   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
 
   SmallVector CstIdxs;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -550,6 +550,67 @@
   return true;
 }
 
+bool matchDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
+                  std::pair<unsigned, int> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+  Register Src1Reg = MI.getOperand(1).getReg();
+  const LLT SrcTy = MRI.getType(Src1Reg);
+  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+  auto LaneIdx = getSplatIndex(MI);
+  if (!LaneIdx)
+    return false;
+
+  // The lane idx should be within the first source vector.
+  if (*LaneIdx >= SrcTy.getNumElements())
+    return false;
+
+  if (DstTy != SrcTy)
+    return false;
+
+  LLT ScalarTy = SrcTy.getElementType();
+  unsigned ScalarSize = ScalarTy.getSizeInBits();
+
+  unsigned Opc = 0;
+  switch (SrcTy.getNumElements()) {
+  case 2:
+    if (ScalarSize == 64)
+      Opc = AArch64::G_DUPLANE64;
+    break;
+  case 4:
+    if (ScalarSize == 32)
+      Opc = AArch64::G_DUPLANE32;
+    break;
+  case 8:
+    if (ScalarSize == 16)
+      Opc = AArch64::G_DUPLANE16;
+    break;
+  case 16:
+    if (ScalarSize == 8)
+      Opc = AArch64::G_DUPLANE8;
+    break;
+  default:
+    break;
+  }
+  if (!Opc)
+    return false;
+
+  MatchInfo.first = Opc;
+  MatchInfo.second = *LaneIdx;
+  return true;
+}
+
+bool applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
+                  MachineIRBuilder &B, std::pair<unsigned, int> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+  B.setInstrAndDebugLoc(MI);
+  auto Lane = B.buildConstant(LLT::scalar(64), MatchInfo.second);
+  B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()},
+               {MI.getOperand(1).getReg(), Lane});
+  MI.eraseFromParent();
+  return true;
+}
+
 #define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS
 #include "AArch64GenPostLegalizeGILowering.inc"
 #undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-duplane.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-duplane.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-duplane.mir
@@ -0,0 +1,120 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=aarch64-postlegalizer-lowering -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64 -global-isel -start-before=aarch64-postlegalizer-lowering -stop-after=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=SELECTED
+
+---
+name: duplane64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: duplane64
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[DUPLANE64_:%[0-9]+]]:_(<2 x s64>) = G_DUPLANE64 [[COPY]], [[C]](s64)
+    ; CHECK: $q0 = COPY [[DUPLANE64_]](<2 x s64>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    ; SELECTED-LABEL: name: duplane64
+    ; SELECTED: liveins: $q0
+    ; SELECTED: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; SELECTED: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[COPY]], 0
+    ; SELECTED: $q0 = COPY [[DUPv2i64lane]]
+    ; SELECTED: RET_ReallyLR implicit $q0
+    %1:_(<2 x s64>) = COPY $q0
+    %2:_(<2 x s64>) = G_IMPLICIT_DEF
+    %4:_(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, shufflemask(0, 0)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: duplane32
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: duplane32
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[DUPLANE32_:%[0-9]+]]:_(<4 x s32>) = G_DUPLANE32 [[COPY]], [[C]](s64)
+    ; CHECK: $q0 = COPY [[DUPLANE32_]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    ; SELECTED-LABEL: name: duplane32
+    ; SELECTED: liveins: $q0
+    ; SELECTED: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; SELECTED: [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[COPY]], 0
+    ; SELECTED: $q0 = COPY [[DUPv4i32lane]]
+    ; SELECTED: RET_ReallyLR implicit $q0
+    %1:_(<4 x s32>) = COPY $q0
+    %2:_(<4 x s32>) = G_IMPLICIT_DEF
+    %4:_(<4 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: duplane16
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: duplane16
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[DUPLANE16_:%[0-9]+]]:_(<8 x s16>) = G_DUPLANE16 [[COPY]], [[C]](s64)
+    ; CHECK: $q0 = COPY [[DUPLANE16_]](<8 x s16>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    ; SELECTED-LABEL: name: duplane16
+    ; SELECTED: liveins: $q0
+    ; SELECTED: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; SELECTED: [[DUPv8i16lane:%[0-9]+]]:fpr128 = DUPv8i16lane [[COPY]], 0
+    ; SELECTED: $q0 = COPY [[DUPv8i16lane]]
+    ; SELECTED: RET_ReallyLR implicit $q0
+    %1:_(<8 x s16>) = COPY $q0
+    %2:_(<8 x s16>) = G_IMPLICIT_DEF
+    %4:_(<8 x s16>) = G_SHUFFLE_VECTOR %1(<8 x s16>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0)
+    $q0 = COPY %4(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: duplane8
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: duplane8
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[DUPLANE8_:%[0-9]+]]:_(<16 x s8>) = G_DUPLANE8 [[COPY]], [[C]](s64)
+    ; CHECK: $q0 = COPY [[DUPLANE8_]](<16 x s8>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    ; SELECTED-LABEL: name: duplane8
+    ; SELECTED: liveins: $q0
+    ; SELECTED: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; SELECTED: [[DUPv16i8lane:%[0-9]+]]:fpr128 = DUPv16i8lane [[COPY]], 0
+    ; SELECTED: $q0 = COPY [[DUPv16i8lane]]
+    ; SELECTED: RET_ReallyLR implicit $q0
+    %1:_(<16 x s8>) = COPY $q0
+    %2:_(<16 x s8>) = G_IMPLICIT_DEF
+    %4:_(<16 x s8>) = G_SHUFFLE_VECTOR %1(<16 x s8>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+    $q0 = COPY %4(<16 x s8>)
+    RET_ReallyLR implicit $q0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-to-duplane.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-to-duplane.mir
deleted file mode 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-to-duplane.mir
+++ /dev/null
@@ -1,103 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-...
----
-name: duplane_v16i8
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-liveins:
-  - { reg: '$q0' }
-body: |
-  bb.1:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: duplane_v16i8
-    ; CHECK: liveins: $q0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[DUPv16i8lane:%[0-9]+]]:fpr128 = DUPv16i8lane [[COPY]], 0
-    ; CHECK: $q0 = COPY [[DUPv16i8lane]]
-    ; CHECK: RET_ReallyLR implicit $q0
-    %0:fpr(<16 x s8>) = COPY $q0
-    %2:fpr(<16 x s8>) = G_IMPLICIT_DEF
-    %1:fpr(<16 x s8>) = G_SHUFFLE_VECTOR %0(<16 x s8>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
-    $q0 = COPY %1(<16 x s8>)
-    RET_ReallyLR implicit $q0
-
-...
----
-name: duplane_v8i16
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-liveins:
-  - { reg: '$q0' }
-body: |
-  bb.1:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: duplane_v8i16
-    ; CHECK: liveins: $q0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[DUPv8i16lane:%[0-9]+]]:fpr128 = DUPv8i16lane [[COPY]], 0
-    ; CHECK: $q0 = COPY [[DUPv8i16lane]]
-    ; CHECK: RET_ReallyLR implicit $q0
-    %0:fpr(<8 x s16>) = COPY $q0
-    %2:fpr(<8 x s16>) = G_IMPLICIT_DEF
-    %1:fpr(<8 x s16>) = G_SHUFFLE_VECTOR %0(<8 x s16>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0)
-    $q0 = COPY %1(<8 x s16>)
-    RET_ReallyLR implicit $q0
-
-...
----
-name: duplane_v4f32
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-liveins:
-  - { reg: '$q0' }
-body: |
-  bb.1:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: duplane_v4f32
-    ; CHECK: liveins: $q0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[COPY]], 0
-    ; CHECK: $q0 = COPY [[DUPv4i32lane]]
-    ; CHECK: RET_ReallyLR implicit $q0
-    %0:fpr(<4 x s32>) = COPY $q0
-    %2:fpr(<4 x s32>) = G_IMPLICIT_DEF
-    %1:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
-    $q0 = COPY %1(<4 x s32>)
-    RET_ReallyLR implicit $q0
-
-...
----
-name: duplane_v2i64
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-liveins:
-  - { reg: '$q0' }
-body: |
-  bb.1:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: duplane_v2i64
-    ; CHECK: liveins: $q0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[COPY]], 0
-    ; CHECK: $q0 = COPY [[DUPv2i64lane]]
-    ; CHECK: RET_ReallyLR implicit $q0
-    %0:fpr(<2 x s64>) = COPY $q0
-    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
-    %1:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(0, 0)
-    $q0 = COPY %1(<2 x s64>)
-    RET_ReallyLR implicit $q0
-
-...
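Note for readers following the patch: the MIR tests above start at the post-legalizer-lowering stage, so they never show the source-level code that produces these shuffles. A minimal IR-level sketch (illustrative only, not part of the patch) of the kind of splat that legalizes to the G_SHUFFLE_VECTOR handled here:

; Illustrative example: a lane-0 splat of a <2 x i64> vector. After
; legalization this becomes G_SHUFFLE_VECTOR ..., shufflemask(0, 0),
; which the new combine rewrites to G_DUPLANE64.
define <2 x i64> @splat_lane_0(<2 x i64> %v) {
entry:
  %splat = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}

With this change the rewrite to a lane-duplicate happens in AArch64PostLegalizerLowering (G_SHUFFLE_VECTOR -> G_DUPLANE64), and instruction selection then matches it to DUPv2i64lane via the GINodeEquiv mappings, instead of special-casing the shuffle in the selector.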