diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16179,6 +16179,33 @@ SDValue Op1 = N->getOperand(1); EVT ResVT = N->getValueType(0); + // uzp1(x, undef) -> concat(truncate(x), undef) + if (Op1.getOpcode() == ISD::UNDEF) { + EVT BCVT = MVT::Other, HalfVT = MVT::Other; + switch (ResVT.getSimpleVT().SimpleTy) { + default: + break; + case MVT::v16i8: + BCVT = MVT::v8i16; + HalfVT = MVT::v8i8; + break; + case MVT::v8i16: + BCVT = MVT::v4i32; + HalfVT = MVT::v4i16; + break; + case MVT::v4i32: + BCVT = MVT::v2i64; + HalfVT = MVT::v2i32; + break; + } + if (BCVT != MVT::Other) { + SDValue BC = DAG.getBitcast(BCVT, Op0); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc, + DAG.getUNDEF(HalfVT)); + } + } + // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll --- a/llvm/test/CodeGen/AArch64/insert-subvector.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll @@ -260,11 +260,9 @@ define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, <4 x i8> *%a) { ; CHECK-LABEL: load_v16i8_4_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[0], v2.s[0] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: mov v0.s[0], v1.s[0] ; CHECK-NEXT: ret %l = load <4 x i8>, <4 x i8> *%a %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> @@ -275,11 +273,9 @@ define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, <4 x i8> *%a) { ; CHECK-LABEL: load_v16i8_4_15: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: adrp x8, .LCPI24_0 -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1 -; CHECK-NEXT: uzp1 v0.16b, v2.16b, v0.16b +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret @@ -292,11 +288,9 @@ define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, <4 x i8> *%a) { ; CHECK-LABEL: load_v16i8_4_2: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: ret %l = load <4 x i8>, <4 x i8> *%a %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> @@ -307,11 +301,9 @@ define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, <4 x i8> *%a) { ; CHECK-LABEL: load_v16i8_4_3: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[2], v2.s[0] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: mov v0.s[2], v1.s[0] ; CHECK-NEXT: ret %l = load <4 x i8>, <4 x i8> *%a %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> @@ -322,11 +314,9 @@ define <16 x i8> @load_v16i8_4_4(float %tmp, <16 x i8> %b, <4 x i8> *%a) { ; CHECK-LABEL: load_v16i8_4_4: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[3], v2.s[0] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: ret %l = load <4 x i8>, <4 x i8> *%a %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> @@ -399,11 +389,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[0], v2.s[0] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: ld1 { v2.h }[2], [x8] +; CHECK-NEXT: xtn v1.4h, v2.4s +; CHECK-NEXT: mov v0.s[0], v1.s[0] ; CHECK-NEXT: ret %l = load <2 x i16>, <2 x i16> *%a %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> @@ -420,9 +410,9 @@ ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: ld1 { v2.h }[2], [x8] ; CHECK-NEXT: adrp x8, .LCPI33_0 -; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI33_0] +; CHECK-NEXT: xtn v0.4h, v2.4s +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v3.16b ; CHECK-NEXT: ret %l = load <2 x i16>, <2 x i16> *%a %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> @@ -435,11 +425,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: ld1 { v2.h }[2], [x8] +; CHECK-NEXT: xtn v1.4h, v2.4s +; CHECK-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: ret %l = load <2 x i16>, <2 x i16> *%a %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> @@ -452,11 +442,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[2], v2.s[0] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: ld1 { v2.h }[2], [x8] +; CHECK-NEXT: xtn v1.4h, v2.4s +; CHECK-NEXT: mov v0.s[2], v1.s[0] ; CHECK-NEXT: ret %l = load <2 x i16>, <2 x i16> *%a %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> @@ -469,11 +459,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.s[3], v2.s[0] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: ld1 { v2.h }[2], [x8] +; CHECK-NEXT: xtn v1.4h, v2.4s +; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: ret %l = load <2 x i16>, <2 x i16> *%a %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll --- a/llvm/test/CodeGen/AArch64/neon-perm.ll +++ b/llvm/test/CodeGen/AArch64/neon-perm.ll @@ -2203,7 +2203,7 @@ define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) { ; CHECK-LABEL: test_undef_vuzp1q_s8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> @@ -2223,7 +2223,7 @@ define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) { ; CHECK-LABEL: test_undef_vuzp1q_s16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> @@ -2233,7 +2233,7 @@ define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) { ; CHECK-LABEL: test_undef_vuzp1q_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> @@ -2253,7 +2253,7 @@ define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) { ; CHECK-LABEL: test_undef_vuzp1q_u8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> @@ -2273,7 +2273,7 @@ define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) { ; CHECK-LABEL: test_undef_vuzp1q_u16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> @@ -2283,7 +2283,7 @@ define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) { ; CHECK-LABEL: test_undef_vuzp1q_u32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> @@ -2313,7 +2313,7 @@ define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) { ; CHECK-LABEL: test_undef_vuzp1q_p8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> @@ -2333,7 +2333,7 @@ define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) { ; CHECK-LABEL: test_undef_vuzp1q_p16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32>