diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll @@ -0,0 +1,305 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" +; Expected to not transform +; *p = (a * b); +; return (a * b) * a; +define <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b, ptr %p) { +; CHECK-LABEL: mul_triangle: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip2 v4.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s +; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s +; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s +; CHECK-NEXT: fmul v6.2s, v5.2s, v4.2s +; CHECK-NEXT: fneg v2.2s, v6.2s +; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s +; CHECK-NEXT: fmul v3.2s, v4.2s, v1.2s +; CHECK-NEXT: fmla v3.2s, v0.2s, v5.2s +; CHECK-NEXT: fmul v1.2s, v3.2s, v4.2s +; CHECK-NEXT: fmul v5.2s, v3.2s, v0.2s +; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x0] +; CHECK-NEXT: fneg v1.2s, v1.2s +; CHECK-NEXT: fmla v5.2s, v4.2s, v2.2s +; CHECK-NEXT: fmla v1.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v0.4s, v1.4s, v5.4s +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec37, %strided.vec + %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %2, %strided.vec35 + %4 = fmul fast <2 x float> %strided.vec38, %strided.vec + %5 = fmul fast <2 x float> 
%strided.vec35, %strided.vec37 + %6 = fadd fast <2 x float> %4, %5 + %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> + store <4 x float> %otheruse, ptr %p + %7 = fmul fast <2 x float> %6, %strided.vec + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec + %10 = fmul fast <2 x float> %6, %strided.vec35 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform. Shows that external use prevents deinterleaving. +; *p = (a * b).real(); +; return (a * b) * a; +define <4 x float> @mul_triangle_external_use(<4 x float> %a, <4 x float> %b, ptr %p) { +; CHECK-LABEL: mul_triangle_external_use: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip2 v4.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v5.2s, v1.2s, v3.2s +; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s +; CHECK-NEXT: fmul v2.2s, v4.2s, v5.2s +; CHECK-NEXT: fmul v3.2s, v1.2s, v4.2s +; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s +; CHECK-NEXT: fneg v1.2s, v3.2s +; CHECK-NEXT: fmul v3.2s, v2.2s, v4.2s +; CHECK-NEXT: fmla v1.2s, v0.2s, v5.2s +; CHECK-NEXT: fmul v5.2s, v2.2s, v0.2s +; CHECK-NEXT: str d2, [x0] +; CHECK-NEXT: fneg v3.2s, v3.2s +; CHECK-NEXT: fmla v5.2s, v4.2s, v1.2s +; CHECK-NEXT: fmla v3.2s, v0.2s, v1.2s +; CHECK-NEXT: zip1 v0.4s, v3.4s, v5.4s +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec37, %strided.vec + %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35 + %2 = fsub fast <2 x float> %0, %1 + 
%3 = fmul fast <2 x float> %2, %strided.vec35 + %4 = fmul fast <2 x float> %strided.vec38, %strided.vec + %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37 + %6 = fadd fast <2 x float> %4, %5 + store <2 x float> %6, ptr %p + %7 = fmul fast <2 x float> %6, %strided.vec + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec + %10 = fmul fast <2 x float> %6, %strided.vec35 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform partially (only d * c). Shows that external use of shufflevector does not prevent deinterleaving. +; *p1 = (a * b).real(); +; *p2 = (a * b) * c; +; return d * c; +define <4 x float> @multiple_muls_shuffle_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) { +; CHECK-LABEL: multiple_muls_shuffle_external: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: zip2 v7.2s, v0.2s, v5.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v5.2s +; CHECK-NEXT: zip1 v16.2s, v1.2s, v6.2s +; CHECK-NEXT: zip2 v1.2s, v1.2s, v6.2s +; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: fcmla v4.4s, v3.4s, v2.4s, #0 +; CHECK-NEXT: fmul v5.2s, v16.2s, v7.2s +; CHECK-NEXT: fmul v7.2s, v1.2s, v7.2s +; CHECK-NEXT: fcmla v4.4s, v3.4s, v2.4s, #90 +; CHECK-NEXT: fmla v5.2s, v0.2s, v1.2s +; CHECK-NEXT: fneg v1.2s, v7.2s +; CHECK-NEXT: zip1 v7.2s, v2.2s, v6.2s +; CHECK-NEXT: zip2 v6.2s, v2.2s, v6.2s +; CHECK-NEXT: fmla v1.2s, v0.2s, v16.2s +; CHECK-NEXT: fmul v17.2s, v7.2s, v5.2s +; CHECK-NEXT: fmul v0.2s, v6.2s, v5.2s +; CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: fmla v17.2s, v1.2s, v6.2s +; CHECK-NEXT: fneg v16.2s, v0.2s +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: fmla v16.2s, v1.2s, v7.2s +; CHECK-NEXT: st2 { v16.2s, v17.2s }, [x1] +; CHECK-NEXT: ret 
+entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec91, %strided.vec + %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88 + %2 = fadd fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %strided.vec90, %strided.vec + %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88 + %5 = fsub fast <2 x float> %3, %4 + store <2 x float> %5, ptr %p1 + %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %6 = fmul fast <2 x float> %strided.vec94, %5 + %7 = fmul fast <2 x float> %strided.vec93, %2 + %8 = fadd fast <2 x float> %6, %7 + %9 = fmul fast <2 x float> %strided.vec93, %5 + %10 = fmul fast <2 x float> %strided.vec94, %2 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + store <4 x float> %interleaved.vec, ptr %p2 + %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94 + %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93 + %14 = fadd fast <2 x float> %13, %12 + %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93 + %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94 + %17 = fsub fast <2 x float> %15, %16 + %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> + ret <4 x float> %interleaved.vec98 +} + +; Same as above but data are loaded from memory instead of being passed as arguments. +; Expected to transform partially (only d * c). 
+; Shows that ld2 is not generated for `c` although it is used by both complex `d * c` and non-complex `(a * b) * c` instruction chains. +define <4 x float> @multiple_muls_shuffle_external_with_loads(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_d, ptr %p1, ptr %p2) { +; CHECK-LABEL: multiple_muls_shuffle_external_with_loads: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld2 { v1.2s, v2.2s }, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ld2 { v3.2s, v4.2s }, [x1] +; CHECK-NEXT: fmul v5.2s, v4.2s, v2.2s +; CHECK-NEXT: fmul v7.2s, v3.2s, v2.2s +; CHECK-NEXT: fneg v5.2s, v5.2s +; CHECK-NEXT: fmla v7.2s, v1.2s, v4.2s +; CHECK-NEXT: fmla v5.2s, v1.2s, v3.2s +; CHECK-NEXT: str d5, [x4] +; CHECK-NEXT: ldr q6, [x2] +; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 +; CHECK-NEXT: zip1 v1.2s, v6.2s, v16.2s +; CHECK-NEXT: zip2 v2.2s, v6.2s, v16.2s +; CHECK-NEXT: fmul v4.2s, v1.2s, v7.2s +; CHECK-NEXT: fmul v7.2s, v2.2s, v7.2s +; CHECK-NEXT: fmla v4.2s, v5.2s, v2.2s +; CHECK-NEXT: fneg v3.2s, v7.2s +; CHECK-NEXT: fmla v3.2s, v5.2s, v1.2s +; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x5] +; CHECK-NEXT: ldr q1, [x3] +; CHECK-NEXT: fcmla v0.4s, v1.4s, v6.4s, #0 +; CHECK-NEXT: fcmla v0.4s, v1.4s, v6.4s, #90 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %ptr_a + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b = load <4 x float>, ptr %ptr_b + %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec91, %strided.vec + %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88 + %2 = fadd fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %strided.vec90, %strided.vec + %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88 + %5 = fsub fast <2 x float> %3, %4 + store <2 x float> %5, ptr %p1 + %c = load <4 x float>, ptr 
%ptr_c + %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %6 = fmul fast <2 x float> %strided.vec94, %5 + %7 = fmul fast <2 x float> %strided.vec93, %2 + %8 = fadd fast <2 x float> %6, %7 + %9 = fmul fast <2 x float> %strided.vec93, %5 + %10 = fmul fast <2 x float> %strided.vec94, %2 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + store <4 x float> %interleaved.vec, ptr %p2 + %d = load <4 x float>, ptr %ptr_d + %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94 + %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93 + %14 = fadd fast <2 x float> %13, %12 + %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93 + %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94 + %17 = fsub fast <2 x float> %15, %16 + %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> + ret <4 x float> %interleaved.vec98 +} + +; Expected to not transform. Shows that external use prevents deinterleaving the whole chain. 
+; *p1 = (a * b).real(); +; *p2 = (a * b) * (d * c); +; return d * c; +define <4 x float> @multiple_muls_mul_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) { +; CHECK-LABEL: multiple_muls_mul_external: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: ext v7.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: zip2 v16.2s, v0.2s, v5.2s +; CHECK-NEXT: zip2 v17.2s, v1.2s, v6.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v5.2s +; CHECK-NEXT: zip1 v1.2s, v1.2s, v6.2s +; CHECK-NEXT: zip1 v18.2s, v2.2s, v7.2s +; CHECK-NEXT: zip2 v2.2s, v2.2s, v7.2s +; CHECK-NEXT: zip2 v7.2s, v3.2s, v4.2s +; CHECK-NEXT: zip1 v3.2s, v3.2s, v4.2s +; CHECK-NEXT: fmul v19.2s, v16.2s, v17.2s +; CHECK-NEXT: fmul v5.2s, v18.2s, v7.2s +; CHECK-NEXT: fmul v6.2s, v2.2s, v7.2s +; CHECK-NEXT: fneg v4.2s, v19.2s +; CHECK-NEXT: fmul v7.2s, v0.2s, v17.2s +; CHECK-NEXT: fmla v5.2s, v3.2s, v2.2s +; CHECK-NEXT: fneg v2.2s, v6.2s +; CHECK-NEXT: fmla v4.2s, v1.2s, v0.2s +; CHECK-NEXT: fmla v7.2s, v1.2s, v16.2s +; CHECK-NEXT: fmla v2.2s, v3.2s, v18.2s +; CHECK-NEXT: fmul v17.2s, v4.2s, v5.2s +; CHECK-NEXT: fmul v0.2s, v7.2s, v5.2s +; CHECK-NEXT: str d4, [x0] +; CHECK-NEXT: fmla v17.2s, v2.2s, v7.2s +; CHECK-NEXT: fneg v16.2s, v0.2s +; CHECK-NEXT: zip1 v0.4s, v2.4s, v5.4s +; CHECK-NEXT: fmla v16.2s, v2.2s, v4.2s +; CHECK-NEXT: st2 { v16.2s, v17.2s }, [x1] +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec126 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec128 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec129 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec129 + %1 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec128 + %2 = fadd nnan 
ninf contract <2 x float> %1, %0 + %3 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec128 + %4 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec129 + %5 = fsub nnan ninf contract <2 x float> %3, %4 + store <2 x float> %5, ptr %p1 + %strided.vec131 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec132 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec134 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %strided.vec135 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %6 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec135 + %7 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec134 + %8 = fadd nnan ninf contract <2 x float> %7, %6 + %9 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec134 + %10 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec135 + %11 = fsub nnan ninf contract <2 x float> %9, %10 + %12 = fmul nnan ninf contract <2 x float> %5, %8 + %13 = fmul nnan ninf contract <2 x float> %2, %11 + %14 = fadd nnan ninf contract <2 x float> %13, %12 + %15 = fmul nnan ninf contract <2 x float> %5, %11 + %16 = fmul nnan ninf contract <2 x float> %2, %8 + %17 = fsub nnan ninf contract <2 x float> %15, %16 + %interleaved.vec = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> + store <4 x float> %interleaved.vec, ptr %p2 + %interleaved.vec136 = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec136 +} +