diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp --- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -3679,15 +3679,44 @@ // We're done, now find out what kind of splat we need. LLT VecTy = MRI.getType(I.getOperand(0).getReg()); LLT EltTy = VecTy.getElementType(); - if (VecTy.getSizeInBits() != 128 || EltTy.getSizeInBits() < 32) { - LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 128b yet"); + if (EltTy.getSizeInBits() < 32) { + LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet"); return false; } bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID; - static const unsigned OpcTable[2][2] = { - {AArch64::DUPv4i32gpr, AArch64::DUPv2i64gpr}, - {AArch64::DUPv4i32lane, AArch64::DUPv2i64lane}}; - unsigned Opc = OpcTable[IsFP][EltTy.getSizeInBits() == 64]; + unsigned Opc = 0; + if (IsFP) { + switch (EltTy.getSizeInBits()) { + case 32: + if (VecTy.getNumElements() == 2) { + Opc = AArch64::DUPv2i32lane; + } else { + Opc = AArch64::DUPv4i32lane; + assert(VecTy.getNumElements() == 4); + } + break; + case 64: + assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); + Opc = AArch64::DUPv2i64lane; + break; + } + } else { + switch (EltTy.getSizeInBits()) { + case 32: + if (VecTy.getNumElements() == 2) { + Opc = AArch64::DUPv2i32gpr; + } else { + Opc = AArch64::DUPv4i32gpr; + assert(VecTy.getNumElements() == 4); + } + break; + case 64: + assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); + Opc = AArch64::DUPv2i64gpr; + break; + } + } + assert(Opc && "Did not compute an opcode for a dup"); // For FP splats, we need to widen the scalar reg via undef too. if (IsFP) { diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -59,7 +59,7 @@ } getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64}) + .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8) .fewerElementsIf( diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir @@ -37,11 +37,11 @@ # FIXME: s2 not correctly handled --- -name: test_implicit_def_v2s32 +name: test_implicit_def_v4s32 body: | bb.0: - ; CHECK-LABEL: name: test_implicit_def_v2s32 + ; CHECK-LABEL: name: test_implicit_def_v4s32 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) ; CHECK: $x0 = COPY [[UV]](<2 x s32>) @@ -67,3 +67,18 @@ $q0 = COPY %1 $q1 = COPY %2 ... +--- +name: test_implicit_def_v2s32 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v2s32 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) + ; CHECK: $w0 = COPY [[UV]](s32) + ; CHECK: $w1 = COPY [[UV1]](s32) + %0:_(<2 x s32>) = G_IMPLICIT_DEF + %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0 + $w0 = COPY %1 + $w1 = COPY %2 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir @@ -49,6 +49,31 @@ $q0 = COPY %4(<2 x s64>) RET_ReallyLR implicit $q0 +... +--- +name: splat_2xi32 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0 + + ; CHECK-LABEL: name: splat_2xi32 + ; CHECK: liveins: $w0 + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK: [[DUPv2i32gpr:%[0-9]+]]:fpr64 = DUPv2i32gpr [[COPY]] + ; CHECK: $d0 = COPY [[DUPv2i32gpr]] + ; CHECK: RET_ReallyLR implicit $d0 + %0:gpr(s32) = COPY $w0 + %2:fpr(<2 x s32>) = G_IMPLICIT_DEF + %3:gpr(s32) = G_CONSTANT i32 0 + %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32) + %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0) + $d0 = COPY %4(<2 x s32>) + RET_ReallyLR implicit $d0 + ... --- name: splat_4xf32 @@ -103,6 +128,33 @@ $q0 = COPY %4(<2 x s64>) RET_ReallyLR implicit $q0 +... +--- +name: splat_2xf32 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $s0 + + ; CHECK-LABEL: name: splat_2xf32 + ; CHECK: liveins: $s0 + ; CHECK: [[COPY:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub + ; CHECK: [[DUPv2i32lane:%[0-9]+]]:fpr64 = DUPv2i32lane [[INSERT_SUBREG]], 0 + ; CHECK: $d0 = COPY [[DUPv2i32lane]] + ; CHECK: RET_ReallyLR implicit $d0 + %0:fpr(s32) = COPY $s0 + %2:fpr(<2 x s32>) = G_IMPLICIT_DEF + %3:gpr(s32) = G_CONSTANT i32 0 + %1:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32) + %4:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2, shufflemask(0, 0) + $d0 = COPY %4(<2 x s32>) + RET_ReallyLR implicit $d0 + ... --- name: splat_2xf64_copies diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll --- a/llvm/test/CodeGen/AArch64/arm64-rev.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll @@ -295,8 +295,7 @@ ; FALLBACK-NEXT: ldr d0, [x0] ; FALLBACK-NEXT: adrp x8, .LCPI13_0 ; FALLBACK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0] -; FALLBACK-NEXT: mov.s v2[1], w8 -; FALLBACK-NEXT: mov.d v0[1], v2[0] +; FALLBACK-NEXT: mov.d v0[1], v0[0] ; FALLBACK-NEXT: tbl.16b v0, { v0 }, v1 ; FALLBACK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; FALLBACK-NEXT: ret @@ -317,8 +316,7 @@ ; FALLBACK-NEXT: ldr d0, [x0] ; FALLBACK-NEXT: adrp x8, .LCPI14_0 ; FALLBACK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] -; FALLBACK-NEXT: mov.s v2[1], w8 -; FALLBACK-NEXT: mov.d v0[1], v2[0] +; FALLBACK-NEXT: mov.d v0[1], v0[0] ; FALLBACK-NEXT: tbl.16b v0, { v0 }, v1 ; FALLBACK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; FALLBACK-NEXT: ret