diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16878,6 +16878,17 @@ } } + // concat(zip1(a, b), zip2(a, b)) is zip1(concat(a, undef), concat(b, undef)) + if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 && + N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) && + N0.getOperand(1) == N1.getOperand(1)) { + SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0), + DAG.getUNDEF(N0.getValueType())); + SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1), + DAG.getUNDEF(N0.getValueType())); + return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1); + } + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll --- a/llvm/test/CodeGen/AArch64/arm64-zip.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll @@ -142,9 +142,9 @@ define <16 x i8> @combine_v16i8(<8 x i8> %0, <8 x i8> %1) { ; CHECK-LABEL: combine_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2.8b v2, v0, v1 -; CHECK-NEXT: zip1.8b v0, v0, v1 -; CHECK-NEXT: mov.d v0[1], v2[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.16b v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> ret <16 x i8> %3 @@ -153,10 +153,9 @@ define <16 x i8> @combine2_v16i8(<8 x i8> %0, <8 x i8> %1) { ; CHECK-LABEL: combine2_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: zip1.8b v2, v0, v1 -; CHECK-NEXT: zip2.8b v0, v0, v1 -; CHECK-NEXT: mov.d v2[1], v0[0] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.16b v0, v0, v1 ; CHECK-NEXT: ret %3 = 
shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> %4 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> @@ -167,9 +166,9 @@ define <8 x i16> @combine_v8i16(<4 x i16> %0, <4 x i16> %1) { ; CHECK-LABEL: combine_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2.4h v2, v0, v1 -; CHECK-NEXT: zip1.4h v0, v0, v1 -; CHECK-NEXT: mov.d v0[1], v2[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.8h v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> ret <8 x i16> %3 @@ -178,10 +177,9 @@ define <8 x i16> @combine2_v8i16(<4 x i16> %0, <4 x i16> %1) { ; CHECK-LABEL: combine2_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: zip1.4h v2, v0, v1 -; CHECK-NEXT: zip2.4h v0, v0, v1 -; CHECK-NEXT: mov.d v2[1], v0[0] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.8h v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> %4 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> @@ -192,9 +190,9 @@ define <4 x i32> @combine_v4i32(<2 x i32> %0, <2 x i32> %1) { ; CHECK-LABEL: combine_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2.2s v2, v0, v1 -; CHECK-NEXT: zip1.2s v0, v0, v1 -; CHECK-NEXT: mov.d v0[1], v2[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.4s v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> ret <4 x i32> %3 @@ -203,10 +201,9 @@ define <4 x i32> @combine2_v4i32(<2 x i32> %0, <2 x i32> %1) { ; CHECK-LABEL: combine2_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: zip1.2s v2, v0, v1 -; CHECK-NEXT: zip2.2s v0, v0, v1 -; CHECK-NEXT: mov.d v2[1], v0[0] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.4s v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector 
<2 x i32> %0, <2 x i32> %1, <2 x i32> %4 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> @@ -217,9 +214,9 @@ define <16 x i8> @combine_v16i8_undef(<8 x i8> %0, <8 x i8> %1) { ; CHECK-LABEL: combine_v16i8_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2.8b v2, v0, v1 -; CHECK-NEXT: zip1.8b v0, v0, v1 -; CHECK-NEXT: mov.d v0[1], v2[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.16b v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> ret <16 x i8> %3 @@ -228,10 +225,9 @@ define <16 x i8> @combine2_v16i8_undef(<8 x i8> %0, <8 x i8> %1) { ; CHECK-LABEL: combine2_v16i8_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: zip1.8b v2, v0, v1 -; CHECK-NEXT: zip2.8b v0, v0, v1 -; CHECK-NEXT: mov.d v2[1], v0[0] -; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.16b v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> %4 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> @@ -242,9 +238,9 @@ define <8 x i16> @combine_v8i16_undef(<4 x i16> %0, <4 x i16> %1) { ; CHECK-LABEL: combine_v8i16_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2.4h v2, v0, v1 -; CHECK-NEXT: zip1.4h v0, v0, v1 -; CHECK-NEXT: mov.d v0[1], v2[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1.8h v0, v0, v1 ; CHECK-NEXT: ret %3 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> ret <8 x i16> %3 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll @@ -56,9 +56,7 @@ ; CHECK-NEXT: fneg v4.2s, v5.2s ; CHECK-NEXT: fmla v3.2s, v0.2s, v2.2s ; CHECK-NEXT: fmla v4.2s, v1.2s, v2.2s -; 
CHECK-NEXT: zip2 v1.2s, v4.2s, v3.2s -; CHECK-NEXT: zip1 v0.2s, v4.2s, v3.2s -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: zip1 v0.4s, v4.4s, v3.4s ; CHECK-NEXT: ret entry: %0 = fsub fast <4 x float> %b, %c @@ -100,9 +98,7 @@ ; CHECK-NEXT: fneg v3.2s, v3.2s ; CHECK-NEXT: fmla v2.2s, v7.2s, v1.2s ; CHECK-NEXT: fmla v3.2s, v0.2s, v1.2s -; CHECK-NEXT: zip2 v1.2s, v3.2s, v2.2s -; CHECK-NEXT: zip1 v0.2s, v3.2s, v2.2s -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: zip1 v0.4s, v3.4s, v2.4s ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> @@ -281,9 +277,7 @@ ; CHECK-NEXT: fmla v16.2s, v0.2s, v5.2s ; CHECK-NEXT: fsub v0.2s, v7.2s, v16.2s ; CHECK-NEXT: fadd v1.2s, v6.2s, v3.2s -; CHECK-NEXT: zip2 v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -334,10 +328,8 @@ ; CHECK-NEXT: fmla v5.2s, v4.2s, v1.2s ; CHECK-NEXT: fmla v3.2s, v0.2s, v1.2s ; CHECK-NEXT: mov v1.d[1], v2.d[0] -; CHECK-NEXT: zip2 v4.2s, v3.2s, v5.2s -; CHECK-NEXT: zip1 v0.2s, v3.2s, v5.2s +; CHECK-NEXT: zip1 v0.4s, v3.4s, v5.4s ; CHECK-NEXT: str q1, [x0] -; CHECK-NEXT: mov v0.d[1], v4.d[0] ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -42,9 +42,7 @@ ; CHECK-NEXT: fmul v4.2s, v0.2s, v5.2s ; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s ; CHECK-NEXT: fsub v0.2s, v3.2s, v4.2s -; CHECK-NEXT: zip2 v1.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret 
entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -125,9 +123,7 @@ ; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s ; CHECK-NEXT: fadd v1.2s, v1.2s, v4.2s ; CHECK-NEXT: fsub v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip2 v2.2s, v1.2s, v0.2s -; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: zip1 v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -169,9 +165,7 @@ ; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s ; CHECK-NEXT: fsub v1.2s, v4.2s, v1.2s ; CHECK-NEXT: fadd v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip2 v2.2s, v1.2s, v0.2s -; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: zip1 v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32>