diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -661,6 +661,8 @@
   case 2:
     if (ScalarSize == 64)
       Opc = AArch64::G_DUPLANE64;
+    else if (ScalarSize == 32)
+      Opc = AArch64::G_DUPLANE32;
     break;
   case 4:
     if (ScalarSize == 32)
@@ -688,10 +690,24 @@
 bool applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
                   MachineIRBuilder &B, std::pair<unsigned, int> &MatchInfo) {
   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+  Register Src1Reg = MI.getOperand(1).getReg();
+  const LLT SrcTy = MRI.getType(Src1Reg);
+  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
   B.setInstrAndDebugLoc(MI);
   auto Lane = B.buildConstant(LLT::scalar(64), MatchInfo.second);
-  B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()},
-               {MI.getOperand(1).getReg(), Lane});
+
+  Register DupSrc = MI.getOperand(1).getReg();
+  // For types like <2 x s32>, we can use G_DUPLANE32, with a <4 x s32> source.
+  // To do this, we can use a G_CONCAT_VECTORS to do the widening.
+  if (SrcTy == LLT::vector(2, LLT::scalar(32))) {
+    assert(DstTy.getNumElements() == 2 && "Unexpected dest elements");
+    auto Undef = B.buildUndef(SrcTy);
+    DupSrc = B.buildConcatVectors(SrcTy.changeNumElements(4),
+                                  {Src1Reg, Undef.getReg(0)})
+                 .getReg(0);
+  }
+  B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()}, {DupSrc, Lane});
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-duplane.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-duplane.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-duplane.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-duplane.mir
@@ -118,3 +118,43 @@
     RET_ReallyLR implicit $q0
 
 ...
+---
+name:            v2s32_duplane32
+alignment:       4
+legalized:       true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+  - { reg: '$x0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: v2s32_duplane32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d1
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[DEF]](<2 x s32>)
+    ; CHECK: [[DUPLANE32_:%[0-9]+]]:_(<2 x s32>) = G_DUPLANE32 [[CONCAT_VECTORS]], [[C]](s64)
+    ; CHECK: $d0 = COPY [[DUPLANE32_]](<2 x s32>)
+    ; CHECK: RET_ReallyLR implicit $d0
+    ; SELECTED-LABEL: name: v2s32_duplane32
+    ; SELECTED: liveins: $d0, $d1
+    ; SELECTED: [[COPY:%[0-9]+]]:fpr64 = COPY $d1
+    ; SELECTED: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; SELECTED: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
+    ; SELECTED: [[DUPv2i32lane:%[0-9]+]]:fpr64 = DUPv2i32lane [[INSERT_SUBREG]], 0
+    ; SELECTED: $d0 = COPY [[DUPv2i32lane]]
+    ; SELECTED: RET_ReallyLR implicit $d0
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = COPY $d1
+    %2:_(<2 x s32>) = G_IMPLICIT_DEF
+    %3:_(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0)
+    $d0 = COPY %3(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-fmul-indexed.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-fmul-indexed.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-fmul-indexed.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:            v2s32_fmul_indexed
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+  - { reg: '$x0' }
+frameInfo:
+  maxAlignment:    1
+body:             |
+  bb.1:
+    liveins: $d0, $x0
+
+    ; CHECK-LABEL: name: v2s32_fmul_indexed
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[COPY1]], 0 :: (load 8, align 4)
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[LDRDui]], %subreg.dsub
+    ; CHECK: [[FMULv2i32_indexed:%[0-9]+]]:fpr64 = FMULv2i32_indexed [[COPY]], [[INSERT_SUBREG]], 0
+    ; CHECK: $d0 = COPY [[FMULv2i32_indexed]]
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:fpr(<2 x s32>) = COPY $d0
+    %1:gpr(p0) = COPY $x0
+    %2:fpr(<2 x s32>) = G_LOAD %1(p0) :: (load 8, align 4)
+    %9:fpr(<2 x s32>) = G_IMPLICIT_DEF
+    %10:fpr(<4 x s32>) = G_CONCAT_VECTORS %2(<2 x s32>), %9(<2 x s32>)
+    %8:gpr(s64) = G_CONSTANT i64 0
+    %5:fpr(<2 x s32>) = G_DUPLANE32 %10, %8(s64)
+    %7:fpr(<2 x s32>) = G_FMUL %0, %5
+    $d0 = COPY %7(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...