diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; a * b + c
+define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip2 v4.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip2 v5.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fmul v1.2d, v5.2d, v4.2d
+; CHECK-NEXT:    fmul v3.2d, v0.2d, v4.2d
+; CHECK-NEXT:    fneg v1.2d, v1.2d
+; CHECK-NEXT:    fmla v3.2d, v2.2d, v5.2d
+; CHECK-NEXT:    fmla v1.2d, v2.2d, v0.2d
+; CHECK-NEXT:    fadd v3.2d, v3.2d, v4.2d
+; CHECK-NEXT:    fadd v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v3.2d
+; CHECK-NEXT:    zip2 v1.2d, v1.2d, v3.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec28 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec30 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec31 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul contract <2 x double> %strided.vec, %strided.vec31
+  %1 = fmul contract <2 x double> %strided.vec28, %strided.vec30
+  %2 = fadd contract <2 x double> %1, %0
+  %3 = fmul contract <2 x double> %strided.vec, %strided.vec30
+  %4 = fmul contract <2 x double> %strided.vec28, %strided.vec31
+  %5 = fsub contract <2 x double> %3, %4
+  %strided.vec33 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec34 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fadd contract <2 x double> %strided.vec33, %5
+  %7 = fadd contract <2 x double> %2, %strided.vec34
+  %interleaved.vec = shufflevector <2 x double> %6, <2 x double> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
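+; Editorial note (not autogenerated): the IR in these tests encodes the usual
+; complex product before any combining, i.e. for z = a * b with deinterleaved
+; even (real) and odd (imaginary) lanes:
+;   z.re = a.re * b.re - a.im * b.im
+;   z.im = a.re * b.im + a.im * b.re
+; With only the contract flag, mull_add above stays on fmul/fneg/fmla; fcmla
+; only appears once a whole complex multiply(-accumulate) is matched.
+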
+; a * b + c * d
+define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT:    fadd v0.2d, v16.2d, v18.2d
+; CHECK-NEXT:    fadd v1.2d, v17.2d, v19.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul contract <2 x double> %strided.vec, %strided.vec54
+  %1 = fmul contract <2 x double> %strided.vec51, %strided.vec53
+  %2 = fadd contract <2 x double> %1, %0
+  %3 = fmul contract <2 x double> %strided.vec, %strided.vec53
+  %4 = fmul contract <2 x double> %strided.vec51, %strided.vec54
+  %5 = fsub contract <2 x double> %3, %4
+  %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fmul contract <2 x double> %strided.vec56, %strided.vec60
+  %7 = fmul contract <2 x double> %strided.vec57, %strided.vec59
+  %8 = fadd contract <2 x double> %7, %6
+  %9 = fmul contract <2 x double> %strided.vec56, %strided.vec59
+  %10 = fmul contract <2 x double> %strided.vec57, %strided.vec60
+  %11 = fsub contract <2 x double> %9, %10
+  %12 = fadd contract <2 x double> %5, %11
+  %13 = fadd contract <2 x double> %2, %8
+  %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; a * b - c * d
+define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT:    fsub v0.2d, v16.2d, v18.2d
+; CHECK-NEXT:    fsub v1.2d, v17.2d, v19.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul contract <2 x double> %strided.vec, %strided.vec54
+  %1 = fmul contract <2 x double> %strided.vec51, %strided.vec53
+  %2 = fadd contract <2 x double> %1, %0
+  %3 = fmul contract <2 x double> %strided.vec, %strided.vec53
+  %4 = fmul contract <2 x double> %strided.vec51, %strided.vec54
+  %5 = fsub contract <2 x double> %3, %4
+  %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fmul contract <2 x double> %strided.vec56, %strided.vec60
+  %7 = fmul contract <2 x double> %strided.vec57, %strided.vec59
+  %8 = fadd contract <2 x double> %7, %6
+  %9 = fmul contract <2 x double> %strided.vec56, %strided.vec59
+  %10 = fmul contract <2 x double> %strided.vec57, %strided.vec60
+  %11 = fsub contract <2 x double> %9, %10
+  %12 = fsub contract <2 x double> %5, %11
+  %13 = fsub contract <2 x double> %2, %8
+  %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
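+; Editorial note (not autogenerated): a paired fcmla #0 + #90 accumulates a
+; full complex product, while a #0 + #270 pair accumulates the product with
+; one operand conjugated, which is what the next test expects to see.
+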
+; a * b + conj(c) * d
+define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    fcmla v18.2d, v6.2d, v4.2d, #270
+; CHECK-NEXT:    fcmla v19.2d, v7.2d, v5.2d, #270
+; CHECK-NEXT:    fadd v0.2d, v16.2d, v18.2d
+; CHECK-NEXT:    fadd v1.2d, v17.2d, v19.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec59 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec61 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec62 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul contract <2 x double> %strided.vec, %strided.vec62
+  %1 = fmul contract <2 x double> %strided.vec59, %strided.vec61
+  %2 = fadd contract <2 x double> %1, %0
+  %3 = fmul contract <2 x double> %strided.vec, %strided.vec61
+  %4 = fmul contract <2 x double> %strided.vec59, %strided.vec62
+  %5 = fsub contract <2 x double> %3, %4
+  %strided.vec64 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec65 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec67 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec68 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fmul contract <2 x double> %strided.vec64, %strided.vec68
+  %7 = fmul contract <2 x double> %strided.vec65, %strided.vec67
+  %8 = fsub contract <2 x double> %6, %7
+  %9 = fmul contract <2 x double> %strided.vec64, %strided.vec67
+  %10 = fmul contract <2 x double> %strided.vec65, %strided.vec68
+  %11 = fadd contract <2 x double> %9, %10
+  %12 = fadd contract <2 x double> %5, %11
+  %13 = fadd contract <2 x double> %2, %8
+  %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
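+; Editorial note (not autogenerated): multiplying by 1i rotates a complex
+; value, (i * c).re = -c.im and (i * c).im = c.re. The copysign(+0.0, x)
+; calls below build the +c.re / -c.im lanes without assuming nsz, since
+; these tests only carry the contract flag.
+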
+; a * b + 1i * c * d
+define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v16.2d, #0xffffffffffffffff
+; CHECK-NEXT:    zip2 v17.2d, v4.2d, v5.2d
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    zip1 v19.2d, v0.2d, v1.2d
+; CHECK-NEXT:    fneg v16.2d, v16.2d
+; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v1.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip1 v5.2d, v2.2d, v3.2d
+; CHECK-NEXT:    mov v4.16b, v16.16b
+; CHECK-NEXT:    bsl v4.16b, v18.16b, v17.16b
+; CHECK-NEXT:    zip2 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    mov v3.16b, v16.16b
+; CHECK-NEXT:    bsl v3.16b, v18.16b, v1.16b
+; CHECK-NEXT:    fadd v1.2d, v1.2d, v4.2d
+; CHECK-NEXT:    zip2 v4.2d, v6.2d, v7.2d
+; CHECK-NEXT:    zip1 v6.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmul v7.2d, v0.2d, v2.2d
+; CHECK-NEXT:    fsub v3.2d, v3.2d, v17.2d
+; CHECK-NEXT:    fmul v16.2d, v1.2d, v4.2d
+; CHECK-NEXT:    fmul v2.2d, v19.2d, v2.2d
+; CHECK-NEXT:    fneg v7.2d, v7.2d
+; CHECK-NEXT:    fmul v4.2d, v3.2d, v4.2d
+; CHECK-NEXT:    fneg v16.2d, v16.2d
+; CHECK-NEXT:    fmla v2.2d, v5.2d, v0.2d
+; CHECK-NEXT:    fmla v7.2d, v5.2d, v19.2d
+; CHECK-NEXT:    fmla v4.2d, v1.2d, v6.2d
+; CHECK-NEXT:    fmla v16.2d, v6.2d, v3.2d
+; CHECK-NEXT:    fadd v1.2d, v2.2d, v4.2d
+; CHECK-NEXT:    fadd v2.2d, v7.2d, v16.2d
+; CHECK-NEXT:    zip1 v0.2d, v2.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec77 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec79 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec80 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul contract <2 x double> %strided.vec, %strided.vec80
+  %1 = fmul contract <2 x double> %strided.vec77, %strided.vec79
+  %2 = fadd contract <2 x double> %1, %0
+  %3 = fmul contract <2 x double> %strided.vec, %strided.vec79
+  %4 = fmul contract <2 x double> %strided.vec77, %strided.vec80
+  %5 = fsub contract <2 x double> %3, %4
+  %strided.vec82 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec83 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %6 = tail call contract <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> %strided.vec83)
+  %7 = fadd contract <2 x double> %strided.vec82, %6
+  %8 = tail call contract <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> %strided.vec82)
+  %9 = fsub contract <2 x double> %8, %strided.vec83
+  %strided.vec85 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec86 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %10 = fmul contract <2 x double> %9, %strided.vec86
+  %11 = fmul contract <2 x double> %strided.vec85, %7
+  %12 = fadd contract <2 x double> %11, %10
+  %13 = fmul contract <2 x double> %9, %strided.vec85
+  %14 = fmul contract <2 x double> %7, %strided.vec86
+  %15 = fsub contract <2 x double> %13, %14
+  %16 = fadd contract <2 x double> %5, %15
+  %17 = fadd contract <2 x double> %2, %12
+  %interleaved.vec = shufflevector <2 x double> %16, <2 x double> %17, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
@@ -0,0 +1,245 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; a * b + c
+define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip2 v6.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip1 v7.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v1.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip1 v4.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip2 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fmla v6.2d, v0.2d, v4.2d
+; CHECK-NEXT:    fmla v1.2d, v7.2d, v4.2d
+; CHECK-NEXT:    fmla v6.2d, v7.2d, v2.2d
+; CHECK-NEXT:    fmls v1.2d, v0.2d, v2.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v6.2d
+; CHECK-NEXT:    zip2 v1.2d, v1.2d, v6.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec28 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec30 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec31 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
+  %2 = fadd fast <2 x double> %0, %1
+  %3 = fmul fast <2 x double> %strided.vec30, %strided.vec
+  %strided.vec33 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec34 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %4 = fadd fast <2 x double> %strided.vec33, %3
+  %5 = fmul fast <2 x double> %strided.vec31, %strided.vec28
+  %6 = fsub fast <2 x double> %4, %5
+  %7 = fadd fast <2 x double> %2, %strided.vec34
+  %interleaved.vec = shufflevector <2 x double> %6, <2 x double> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; a * b + c * d
+define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip1 v16.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip1 v17.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip1 v2.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip2 v3.2d, v4.2d, v5.2d
+; CHECK-NEXT:    fmul v4.2d, v16.2d, v0.2d
+; CHECK-NEXT:    zip1 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT:    zip2 v6.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmul v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fmul v7.2d, v16.2d, v17.2d
+; CHECK-NEXT:    fmla v4.2d, v17.2d, v1.2d
+; CHECK-NEXT:    fmla v0.2d, v3.2d, v6.2d
+; CHECK-NEXT:    fmla v7.2d, v2.2d, v5.2d
+; CHECK-NEXT:    fmla v4.2d, v3.2d, v5.2d
+; CHECK-NEXT:    fsub v1.2d, v7.2d, v0.2d
+; CHECK-NEXT:    fmla v4.2d, v2.2d, v6.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v4.2d
+; CHECK-NEXT:    zip2 v1.2d, v1.2d, v4.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec54, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec53, %strided.vec51
+  %2 = fmul fast <2 x double> %strided.vec53, %strided.vec
+  %3 = fmul fast <2 x double> %strided.vec54, %strided.vec51
+  %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %4 = fmul fast <2 x double> %strided.vec60, %strided.vec56
+  %5 = fmul fast <2 x double> %strided.vec59, %strided.vec57
+  %6 = fmul fast <2 x double> %strided.vec59, %strided.vec56
+  %7 = fmul fast <2 x double> %strided.vec60, %strided.vec57
+  %8 = fadd fast <2 x double> %7, %3
+  %9 = fadd fast <2 x double> %6, %2
+  %10 = fsub fast <2 x double> %9, %8
+  %11 = fadd fast <2 x double> %0, %1
+  %12 = fadd fast <2 x double> %11, %5
+  %13 = fadd fast <2 x double> %12, %4
+  %interleaved.vec = shufflevector <2 x double> %10, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
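+; Editorial note (not autogenerated): with full fast-math the input IR is
+; already reassociated, and the CHECK lines above show this lowering staying
+; on zip/fmul/fmla sequences rather than the fcmla pairs produced by the
+; contract-only variant of this test.
+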
+; a * b - c * d
+define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip1 v17.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip1 v18.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip2 v2.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip1 v3.2d, v6.2d, v7.2d
+; CHECK-NEXT:    zip1 v16.2d, v4.2d, v5.2d
+; CHECK-NEXT:    fmul v4.2d, v17.2d, v0.2d
+; CHECK-NEXT:    fmul v5.2d, v17.2d, v18.2d
+; CHECK-NEXT:    fmul v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    zip2 v6.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmul v7.2d, v3.2d, v2.2d
+; CHECK-NEXT:    fmla v4.2d, v18.2d, v1.2d
+; CHECK-NEXT:    fmla v0.2d, v16.2d, v3.2d
+; CHECK-NEXT:    fmla v5.2d, v2.2d, v6.2d
+; CHECK-NEXT:    fmla v7.2d, v16.2d, v6.2d
+; CHECK-NEXT:    fsub v1.2d, v5.2d, v0.2d
+; CHECK-NEXT:    fsub v2.2d, v4.2d, v7.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v2.2d
+; CHECK-NEXT:    zip2 v1.2d, v1.2d, v2.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec53 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec55 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec56 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec56, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec55, %strided.vec53
+  %2 = fmul fast <2 x double> %strided.vec55, %strided.vec
+  %3 = fmul fast <2 x double> %strided.vec56, %strided.vec53
+  %strided.vec58 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec59 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec61 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec62 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %4 = fmul fast <2 x double> %strided.vec62, %strided.vec59
+  %5 = fmul fast <2 x double> %strided.vec61, %strided.vec58
+  %6 = fadd fast <2 x double> %5, %3
+  %7 = fadd fast <2 x double> %4, %2
+  %8 = fsub fast <2 x double> %7, %6
+  %9 = fmul fast <2 x double> %strided.vec61, %strided.vec59
+  %10 = fmul fast <2 x double> %strided.vec62, %strided.vec58
+  %11 = fadd fast <2 x double> %10, %9
+  %12 = fadd fast <2 x double> %0, %1
+  %13 = fsub fast <2 x double> %12, %11
+  %interleaved.vec = shufflevector <2 x double> %8, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; a * b + conj(c) * d
+define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip2 v16.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip2 v17.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    fmul v3.2d, v16.2d, v17.2d
+; CHECK-NEXT:    fmul v1.2d, v2.2d, v17.2d
+; CHECK-NEXT:    zip1 v17.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip2 v4.2d, v4.2d, v5.2d
+; CHECK-NEXT:    fneg v3.2d, v3.2d
+; CHECK-NEXT:    zip1 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmla v1.2d, v0.2d, v16.2d
+; CHECK-NEXT:    fmla v3.2d, v0.2d, v2.2d
+; CHECK-NEXT:    zip2 v0.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmls v1.2d, v4.2d, v5.2d
+; CHECK-NEXT:    fmla v3.2d, v17.2d, v5.2d
+; CHECK-NEXT:    fmla v1.2d, v17.2d, v0.2d
+; CHECK-NEXT:    fmla v3.2d, v4.2d, v0.2d
+; CHECK-NEXT:    zip1 v0.2d, v3.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v3.2d, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec59 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec61 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec62 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec62, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec61, %strided.vec59
+  %2 = fmul fast <2 x double> %strided.vec61, %strided.vec
+  %strided.vec64 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec65 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec67 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec68 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %3 = fmul fast <2 x double> %strided.vec68, %strided.vec64
+  %4 = fmul fast <2 x double> %strided.vec67, %strided.vec64
+  %5 = fmul fast <2 x double> %strided.vec68, %strided.vec65
+  %6 = fmul fast <2 x double> %strided.vec62, %strided.vec59
+  %7 = fsub fast <2 x double> %2, %6
+  %8 = fadd fast <2 x double> %7, %4
+  %9 = fadd fast <2 x double> %8, %5
+  %10 = fadd fast <2 x double> %0, %1
+  %11 = fmul fast <2 x double> %strided.vec67, %strided.vec65
+  %12 = fsub fast <2 x double> %10, %11
+  %13 = fadd fast <2 x double> %12, %3
+  %interleaved.vec = shufflevector <2 x double> %9, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; a * b + 1i * c * d
+define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip2 v16.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip2 v17.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip2 v3.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip1 v18.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmul v19.2d, v16.2d, v17.2d
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v1.2d, v4.2d, v5.2d
+; CHECK-NEXT:    fmul v4.2d, v2.2d, v17.2d
+; CHECK-NEXT:    zip2 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmla v19.2d, v3.2d, v18.2d
+; CHECK-NEXT:    fmla v4.2d, v0.2d, v16.2d
+; CHECK-NEXT:    fmla v19.2d, v1.2d, v5.2d
+; CHECK-NEXT:    fmla v4.2d, v1.2d, v18.2d
+; CHECK-NEXT:    fneg v1.2d, v19.2d
+; CHECK-NEXT:    fmls v4.2d, v3.2d, v5.2d
+; CHECK-NEXT:    fmla v1.2d, v0.2d, v2.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v4.2d
+; CHECK-NEXT:    zip2 v1.2d, v1.2d, v4.2d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec79 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec81 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec82 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec82, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec81, %strided.vec79
+  %2 = fmul fast <2 x double> %strided.vec81, %strided.vec
+  %3 = fmul fast <2 x double> %strided.vec82, %strided.vec79
+  %strided.vec84 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec85 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec87 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec88 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %4 = fmul fast <2 x double> %strided.vec87, %strided.vec84
+  %5 = fmul fast <2 x double> %strided.vec87, %strided.vec85
+  %6 = fmul fast <2 x double> %strided.vec88, %strided.vec84
+  %7 = fadd fast <2 x double> %5, %3
+  %8 = fadd fast <2 x double> %7, %6
+  %9 = fsub fast <2 x double> %2, %8
+  %10 = fadd fast <2 x double> %0, %1
+  %11 = fadd fast <2 x double> %10, %4
+  %12 = fmul fast <2 x double> %strided.vec88, %strided.vec85
+  %13 = fsub fast <2 x double> %11, %12
+  %interleaved.vec = shufflevector <2 x double> %9, <2 x double> %13, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
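+; Editorial note (not autogenerated): in this fast-math variant the 1i factor
+; above folds into swapped operands plus a single fneg, in contrast to the
+; copysign-based expansion in the contract-only file.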
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; a * b + c
+define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp2 z6.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z7.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    fmul z2.d, z0.d, z6.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z7.d, z1.d
+; CHECK-NEXT:    fmul z3.d, z7.d, z6.d
+; CHECK-NEXT:    fnmsb z0.d, p0/m, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z1.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z3.d, z4.d, z5.d
+; CHECK-NEXT:    fadd z3.d, z3.d, z0.d
+; CHECK-NEXT:    fadd z1.d, z2.d, z1.d
+; CHECK-NEXT:    zip1 z0.d, z3.d, z1.d
+; CHECK-NEXT:    zip2 z1.d, z3.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
+  %4 = fmul contract <vscale x 2 x double> %0, %3
+  %5 = fmul contract <vscale x 2 x double> %1, %2
+  %6 = fadd contract <vscale x 2 x double> %5, %4
+  %7 = fmul contract <vscale x 2 x double> %0, %2
+  %8 = fmul contract <vscale x 2 x double> %1, %3
+  %9 = fsub contract <vscale x 2 x double> %7, %8
+  %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
+  %12 = fadd contract <vscale x 2 x double> %10, %9
+  %13 = fadd contract <vscale x 2 x double> %6, %11
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
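+; Editorial note (not autogenerated): scalable vectors cannot express the
+; even/odd shuffle masks used by the fixed-width tests, so these functions
+; split and rebuild the complex lanes with the target-independent
+; llvm.experimental.vector.deinterleave2 / interleave2 intrinsics instead.
+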
+; a * b + c * d
+define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z25.d, z24.d
+; CHECK-NEXT:    mov z26.d, z24.d
+; CHECK-NEXT:    mov z27.d, z24.d
+; CHECK-NEXT:    fcmla z25.d, p0/m, z2.d, z0.d, #0
+; CHECK-NEXT:    fcmla z26.d, p0/m, z3.d, z1.d, #0
+; CHECK-NEXT:    fcmla z27.d, p0/m, z6.d, z4.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT:    fcmla z26.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT:    fcmla z27.d, p0/m, z6.d, z4.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #90
+; CHECK-NEXT:    fadd z0.d, z25.d, z27.d
+; CHECK-NEXT:    fadd z1.d, z26.d, z24.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
+  %4 = fmul contract <vscale x 2 x double> %0, %3
+  %5 = fmul contract <vscale x 2 x double> %1, %2
+  %6 = fadd contract <vscale x 2 x double> %5, %4
+  %7 = fmul contract <vscale x 2 x double> %0, %2
+  %8 = fmul contract <vscale x 2 x double> %1, %3
+  %9 = fsub contract <vscale x 2 x double> %7, %8
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+  %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+  %14 = fmul contract <vscale x 2 x double> %10, %13
+  %15 = fmul contract <vscale x 2 x double> %11, %12
+  %16 = fadd contract <vscale x 2 x double> %15, %14
+  %17 = fmul contract <vscale x 2 x double> %10, %12
+  %18 = fmul contract <vscale x 2 x double> %11, %13
+  %19 = fsub contract <vscale x 2 x double> %17, %18
+  %20 = fadd contract <vscale x 2 x double> %9, %19
+  %21 = fadd contract <vscale x 2 x double> %6, %16
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b - c * d
+define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z25.d, z24.d
+; CHECK-NEXT:    mov z26.d, z24.d
+; CHECK-NEXT:    mov z27.d, z24.d
+; CHECK-NEXT:    fcmla z25.d, p0/m, z2.d, z0.d, #0
+; CHECK-NEXT:    fcmla z26.d, p0/m, z3.d, z1.d, #0
+; CHECK-NEXT:    fcmla z27.d, p0/m, z6.d, z4.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT:    fcmla z26.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT:    fcmla z27.d, p0/m, z6.d, z4.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #90
+; CHECK-NEXT:    fsub z0.d, z25.d, z27.d
+; CHECK-NEXT:    fsub z1.d, z26.d, z24.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
+  %4 = fmul contract <vscale x 2 x double> %0, %3
+  %5 = fmul contract <vscale x 2 x double> %1, %2
+  %6 = fadd contract <vscale x 2 x double> %5, %4
+  %7 = fmul contract <vscale x 2 x double> %0, %2
+  %8 = fmul contract <vscale x 2 x double> %1, %3
+  %9 = fsub contract <vscale x 2 x double> %7, %8
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+  %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+  %14 = fmul contract <vscale x 2 x double> %10, %13
+  %15 = fmul contract <vscale x 2 x double> %11, %12
+  %16 = fadd contract <vscale x 2 x double> %15, %14
+  %17 = fmul contract <vscale x 2 x double> %10, %12
+  %18 = fmul contract <vscale x 2 x double> %11, %13
+  %19 = fsub contract <vscale x 2 x double> %17, %18
+  %20 = fsub contract <vscale x 2 x double> %9, %19
+  %21 = fsub contract <vscale x 2 x double> %6, %16
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
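+; Editorial note (not autogenerated): the SVE fcmla form is predicated
+; (p0/m); as in the NEON tests, a #0 + #270 pair below matches the
+; conjugated operand of mul_conj_mull.
+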
+; a * b + conj(c) * d
+define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z25.d, z24.d
+; CHECK-NEXT:    mov z26.d, z24.d
+; CHECK-NEXT:    mov z27.d, z24.d
+; CHECK-NEXT:    fcmla z25.d, p0/m, z2.d, z0.d, #0
+; CHECK-NEXT:    fcmla z26.d, p0/m, z3.d, z1.d, #0
+; CHECK-NEXT:    fcmla z27.d, p0/m, z4.d, z6.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #0
+; CHECK-NEXT:    fcmla z25.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT:    fcmla z26.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT:    fcmla z27.d, p0/m, z4.d, z6.d, #270
+; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #270
+; CHECK-NEXT:    fadd z0.d, z25.d, z27.d
+; CHECK-NEXT:    fadd z1.d, z26.d, z24.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
+  %4 = fmul contract <vscale x 2 x double> %0, %3
+  %5 = fmul contract <vscale x 2 x double> %1, %2
+  %6 = fadd contract <vscale x 2 x double> %5, %4
+  %7 = fmul contract <vscale x 2 x double> %0, %2
+  %8 = fmul contract <vscale x 2 x double> %1, %3
+  %9 = fsub contract <vscale x 2 x double> %7, %8
+  %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
+  %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
+  %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
+  %14 = fmul contract <vscale x 2 x double> %10, %13
+  %15 = fmul contract <vscale x 2 x double> %11, %12
+  %16 = fsub contract <vscale x 2 x double> %14, %15
+  %17 = fmul contract <vscale x 2 x double> %10, %12
+  %18 = fmul contract <vscale x 2 x double> %11, %13
+  %19 = fadd contract <vscale x 2 x double> %17, %18
+  %20 = fadd contract <vscale x 2 x double> %9, %19
+  %21 = fadd contract <vscale x 2 x double> %6, %16
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + 1i * c * d
+define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp2 z24.d, z4.d, z5.d
+; CHECK-NEXT:    mov z26.d, #0 // =0x0
+; CHECK-NEXT:    mov z25.d, z24.d
+; CHECK-NEXT:    and z26.d, z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    and z25.d, z25.d, #0x8000000000000000
+; CHECK-NEXT:    uzp2 z27.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z4.d, z5.d
+; CHECK-NEXT:    orr z5.d, z26.d, z25.d
+; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT:    fadd z5.d, z1.d, z5.d
+; CHECK-NEXT:    and z1.d, z1.d, #0x8000000000000000
+; CHECK-NEXT:    orr z1.d, z26.d, z1.d
+; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT:    fsub z1.d, z1.d, z24.d
+; CHECK-NEXT:    uzp2 z24.d, z6.d, z7.d
+; CHECK-NEXT:    fmul z3.d, z0.d, z2.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp1 z6.d, z6.d, z7.d
+; CHECK-NEXT:    fmul z7.d, z1.d, z24.d
+; CHECK-NEXT:    fmla z3.d, p0/m, z27.d, z4.d
+; CHECK-NEXT:    fmla z7.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    fmul z2.d, z27.d, z2.d
+; CHECK-NEXT:    fmul z5.d, z5.d, z24.d
+; CHECK-NEXT:    fnmsb z0.d, p0/m, z4.d, z2.d
+; CHECK-NEXT:    fnmsb z1.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    fadd z1.d, z0.d, z1.d
+; CHECK-NEXT:    fadd z2.d, z3.d, z7.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 1
+  %4 = fmul contract <vscale x 2 x double> %0, %3
+  %5 = fmul contract <vscale x 2 x double> %1, %2
+  %6 = fadd contract <vscale x 2 x double> %5, %4
+  %7 = fmul contract <vscale x 2 x double> %0, %2
+  %8 = fmul contract <vscale x 2 x double> %1, %3
+  %9 = fsub contract <vscale x 2 x double> %7, %8
+  %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
+  %12 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %11)
+  %13 = fadd contract <vscale x 2 x double> %10, %12
+  %14 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %10)
+  %15 = fsub contract <vscale x 2 x double> %14, %11
+  %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %16 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
+  %17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
+  %18 = fmul contract <vscale x 2 x double> %15, %17
+  %19 = fmul contract <vscale x 2 x double> %16, %13
+  %20 = fadd contract <vscale x 2 x double> %19, %18
+  %21 = fmul contract <vscale x 2 x double> %15, %16
+  %22 = fmul contract <vscale x 2 x double> %13, %17
+  %23 = fsub contract <vscale x 2 x double> %21, %22
+  %24 = fadd contract <vscale x 2 x double> %9, %23
+  %25 = fadd contract <vscale x 2 x double> %6, %20
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; a * b + c
+define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
+; CHECK-LABEL: mull_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp2 z6.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z7.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z4.d, z7.d
+; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT:    movprfx z5, z6
+; CHECK-NEXT:    fmla z5.d, p0/m, z4.d, z0.d
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    fmla z3.d, p0/m, z2.d, z7.d
+; CHECK-NEXT:    fmls z1.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z3.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z3.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
+  %4 = fmul fast <vscale x 2 x double> %3, %0
+  %5 = fmul fast <vscale x 2 x double> %2, %1
+  %6 = fadd fast <vscale x 2 x double> %4, %5
+  %7 = fmul fast <vscale x 2 x double> %2, %0
+  %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
+  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
+  %10 = fadd fast <vscale x 2 x double> %8, %7
+  %11 = fmul fast <vscale x 2 x double> %3, %1
+  %12 = fsub fast <vscale x 2 x double> %10, %11
+  %13 = fadd fast <vscale x 2 x double> %6, %9
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b + c * d
+define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z26.d, z6.d, z7.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z25.d
+; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
+; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    uzp2 z5.d, z6.d, z7.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z26.d, z4.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z26.d, z3.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z3.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z5.d, z4.d
+; CHECK-NEXT:    fsub z1.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
+  %4 = fmul fast <vscale x 2 x double> %3, %0
+  %5 = fmul fast <vscale x 2 x double> %2, %1
+  %6 = fmul fast <vscale x 2 x double> %2, %0
+  %7 = fmul fast <vscale x 2 x double> %3, %1
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+  %12 = fmul fast <vscale x 2 x double> %11, %8
+  %13 = fmul fast <vscale x 2 x double> %10, %9
+  %14 = fmul fast <vscale x 2 x double> %10, %8
+  %15 = fmul fast <vscale x 2 x double> %11, %9
+  %16 = fadd fast <vscale x 2 x double> %15, %7
+  %17 = fadd fast <vscale x 2 x double> %14, %6
+  %18 = fsub fast <vscale x 2 x double> %17, %16
+  %19 = fadd fast <vscale x 2 x double> %4, %5
+  %20 = fadd fast <vscale x 2 x double> %19, %13
+  %21 = fadd fast <vscale x 2 x double> %20, %12
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a * b - c * d
+define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_sub_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z25.d
+; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z5.d, z6.d, z7.d
+; CHECK-NEXT:    uzp2 z6.d, z6.d, z7.d
+; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z6.d, z3.d
+; CHECK-NEXT:    fmul z3.d, z5.d, z3.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z4.d
+; CHECK-NEXT:    fmla z3.d, p0/m, z6.d, z4.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT:    fsub z1.d, z1.d, z0.d
+; CHECK-NEXT:    fsub z2.d, z2.d, z3.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
+  %4 = fmul fast <vscale x 2 x double> %3, %0
+  %5 = fmul fast <vscale x 2 x double> %2, %1
+  %6 = fmul fast <vscale x 2 x double> %2, %0
+  %7 = fmul fast <vscale x 2 x double> %3, %1
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
+  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
+  %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 1
+  %12 = fmul fast <vscale x 2 x double> %11, %9
+  %13 = fmul fast <vscale x 2 x double> %10, %8
+  %14 = fadd fast <vscale x 2 x double> %13, %7
+  %15 = fadd fast <vscale x 2 x double> %12, %6
+  %16 = fsub fast <vscale x 2 x double> %15, %14
+  %17 = fmul fast <vscale x 2 x double> %10, %9
+  %18 = fmul fast <vscale x 2 x double> %11, %8
+  %19 = fadd fast <vscale x 2 x double> %18, %17
+  %20 = fadd fast <vscale x 2 x double> %4, %5
+  %21 = fsub fast <vscale x 2 x double> %20, %19
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
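+; Editorial note (not autogenerated): for the conjugated product below,
+;   (a*b + conj(c)*d).re = a.re*b.re - a.im*b.im + c.re*d.re + c.im*d.im
+;   (a*b + conj(c)*d).im = a.re*b.im + a.im*b.re + c.re*d.im - c.im*d.re
+; which is the reassociated chain the fast-math IR feeds to the backend.
+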
+; a * b + conj(c) * d
+define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_conj_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z5.d, z6.d, z7.d
+; CHECK-NEXT:    fnmls z0.d, p0/m, z1.d, z25.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z4.d
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fmls z1.d, p0/m, z5.d, z3.d
+; CHECK-NEXT:    uzp2 z2.d, z6.d, z7.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z2.d, z4.d
+; CHECK-NEXT:    fmad z3.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    zip1 z0.d, z3.d, z1.d
+; CHECK-NEXT:    zip2 z1.d, z3.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
+  %4 = fmul fast <vscale x 2 x double> %3, %0
+  %5 = fmul fast <vscale x 2 x double> %2, %1
+  %6 = fmul fast <vscale x 2 x double> %2, %0
+  %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
+  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
+  %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
+  %11 = fmul fast <vscale x 2 x double> %10, %7
+  %12 = fmul fast <vscale x 2 x double> %9, %7
+  %13 = fmul fast <vscale x 2 x double> %10, %8
+  %14 = fmul fast <vscale x 2 x double> %3, %1
+  %15 = fsub fast <vscale x 2 x double> %6, %14
+  %16 = fadd fast <vscale x 2 x double> %15, %12
+  %17 = fadd fast <vscale x 2 x double> %16, %13
+  %18 = fadd fast <vscale x 2 x double> %4, %5
+  %19 = fmul fast <vscale x 2 x double> %9, %8
+  %20 = fsub fast <vscale x 2 x double> %18, %19
+  %21 = fadd fast <vscale x 2 x double> %20, %11
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
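+; Editorial note (not autogenerated): 1i * c * d only swaps and negates
+; lanes, so under fast-math the rotation is expected to fold into the
+; multiply-accumulate chain (the fnmsb/fmls in the CHECK lines below)
+; rather than needing any explicit sign manipulation.
+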
+; a * b + 1i * c * d
+define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
+; CHECK-LABEL: mul_add_rot_mull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
+; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z24.d, z6.d, z7.d
+; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z24.d, z3.d
+; CHECK-NEXT:    uzp2 z5.d, z6.d, z7.d
+; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z4.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z4.d
+; CHECK-NEXT:    fmls z2.d, p0/m, z5.d, z3.d
+; CHECK-NEXT:    fnmsb z1.d, p0/m, z25.d, z0.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
+  %4 = fmul fast <vscale x 2 x double> %3, %0
+  %5 = fmul fast <vscale x 2 x double> %2, %1
+  %6 = fmul fast <vscale x 2 x double> %2, %0
+  %7 = fmul fast <vscale x 2 x double> %3, %1
+  %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
+  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
+  %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 0
+  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 1
+  %12 = fmul fast <vscale x 2 x double> %10, %8
+  %13 = fmul fast <vscale x 2 x double> %10, %9
+  %14 = fmul fast <vscale x 2 x double> %11, %8
+  %15 = fadd fast <vscale x 2 x double> %13, %7
+  %16 = fadd fast <vscale x 2 x double> %15, %14
+  %17 = fsub fast <vscale x 2 x double> %6, %16
+  %18 = fadd fast <vscale x 2 x double> %4, %5
+  %19 = fadd fast <vscale x 2 x double> %18, %12
+  %20 = fmul fast <vscale x 2 x double> %11, %9
+  %21 = fsub fast <vscale x 2 x double> %19, %20
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
@@ -293,3 +293,107 @@
   ret <4 x float> %interleaved.vec136
 }
 
+; Expected to transform. Shows that a composite common subexpression is not generated twice.
+; u[i] = a[i] * b[i] - (c[i] * d[i] + g[i] * h[i]);
+; v[i] = e[i] * f[i] + (c[i] * d[i] + g[i] * h[i]);
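+; The shared term here is s[i] = c[i] * d[i] + g[i] * h[i]; the CHECK lines
+; in the function below verify it is materialized once and fed to both st2
+; stores. (Editorial restatement, not autogenerated.)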
+define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) {
+; CHECK-LABEL: mul_add_common_mul_add_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp q17, q16, [sp, #96]
+; CHECK-NEXT:    zip2 v20.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip2 v21.2d, v6.2d, v7.2d
+; CHECK-NEXT:    zip1 v4.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip1 v5.2d, v6.2d, v7.2d
+; CHECK-NEXT:    ldp q19, q18, [sp, #64]
+; CHECK-NEXT:    zip2 v23.2d, v17.2d, v16.2d
+; CHECK-NEXT:    fmul v6.2d, v21.2d, v20.2d
+; CHECK-NEXT:    zip1 v16.2d, v17.2d, v16.2d
+; CHECK-NEXT:    zip2 v22.2d, v19.2d, v18.2d
+; CHECK-NEXT:    zip1 v18.2d, v19.2d, v18.2d
+; CHECK-NEXT:    fneg v6.2d, v6.2d
+; CHECK-NEXT:    fmul v20.2d, v5.2d, v20.2d
+; CHECK-NEXT:    fmul v7.2d, v22.2d, v23.2d
+; CHECK-NEXT:    fmla v6.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip2 v5.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fneg v7.2d, v7.2d
+; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fmla v7.2d, v18.2d, v16.2d
+; CHECK-NEXT:    fadd v19.2d, v7.2d, v6.2d
+; CHECK-NEXT:    fmla v20.2d, v4.2d, v21.2d
+; CHECK-NEXT:    zip2 v4.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ldp q7, q6, [sp]
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    fmla v20.2d, v18.2d, v23.2d
+; CHECK-NEXT:    fmul v1.2d, v2.2d, v4.2d
+; CHECK-NEXT:    fmla v20.2d, v22.2d, v16.2d
+; CHECK-NEXT:    mov v3.16b, v19.16b
+; CHECK-NEXT:    fmla v1.2d, v0.2d, v5.2d
+; CHECK-NEXT:    fmla v3.2d, v4.2d, v5.2d
+; CHECK-NEXT:    ldp q16, q4, [sp, #32]
+; CHECK-NEXT:    fneg v17.2d, v3.2d
+; CHECK-NEXT:    zip1 v3.2d, v7.2d, v6.2d
+; CHECK-NEXT:    zip2 v6.2d, v7.2d, v6.2d
+; CHECK-NEXT:    zip1 v5.2d, v16.2d, v4.2d
+; CHECK-NEXT:    fmla v17.2d, v0.2d, v2.2d
+; CHECK-NEXT:    fsub v18.2d, v1.2d, v20.2d
+; CHECK-NEXT:    zip2 v0.2d, v16.2d, v4.2d
+; CHECK-NEXT:    fmla v19.2d, v3.2d, v5.2d
+; CHECK-NEXT:    st2 { v17.2d, v18.2d }, [x0]
+; CHECK-NEXT:    fmls v19.2d, v6.2d, v0.2d
+; CHECK-NEXT:    fmla v20.2d, v6.2d, v5.2d
+; CHECK-NEXT:    fmla v20.2d, v3.2d, v0.2d
+; CHECK-NEXT:    st2 { v19.2d, v20.2d }, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec123 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec125 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec126 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec125, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec126, %strided.vec
+  %2 = fmul fast <2 x double> %strided.vec125, %strided.vec123
+  %3 = fadd fast <2 x double> %1, %2
+  %strided.vec128 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec129 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec131 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec132 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %4 = fmul fast <2 x double> %strided.vec131, %strided.vec128
+  %5 = fmul fast <2 x double> %strided.vec132, %strided.vec129
+  %6 = fmul fast <2 x double> %strided.vec132, %strided.vec128
+  %7 = fmul fast <2 x double> %strided.vec131, %strided.vec129
+  %8 = fsub fast <2 x double> %4, %5
+  %strided.vec134 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec135 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec137 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec138 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %9 = fmul fast <2 x double> %strided.vec138, %strided.vec134
+  %10 = fmul fast <2 x double> %strided.vec137, %strided.vec135
+  %11 = fmul fast <2 x double> %strided.vec137, %strided.vec134
+  %12 = fmul fast <2 x double> %strided.vec135, %strided.vec138
+  %13 = fsub fast <2 x double> %11, %12
+  %14 = fadd fast <2 x double> %13, %8
+  %15 = fadd fast <2 x double> %6, %7
+  %16 = fadd fast <2 x double> %15, %9
+  %17 = fadd fast <2 x double> %16, %10
+  %18 = fmul fast <2 x double> %strided.vec126, %strided.vec123
+  %19 = fadd fast <2 x double> %18, %14
+  %20 = fsub fast <2 x double> %0, %19
+  %21 = fsub fast <2 x double> %3, %17
+  %interleaved.vec = shufflevector <2 x double> %20, <2 x double> %21, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x double> %interleaved.vec, ptr %p1, align 8
+  %strided.vec140 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec141 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec143 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec144 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %22 = fmul fast <2 x double> %strided.vec143, %strided.vec140
+  %23 = fmul fast <2 x double> %strided.vec144, %strided.vec140
+  %24 = fmul fast <2 x double> %strided.vec143, %strided.vec141
+  %25 = fadd fast <2 x double> %22, %14
+  %26 = fmul fast <2 x double> %strided.vec144, %strided.vec141
+  %27 = fsub fast <2 x double> %25, %26
+  %28 = fadd fast <2 x double> %24, %17
+  %29 = fadd fast <2 x double> %28, %23
+  %interleaved.vec145 = shufflevector <2 x double> %27, <2 x double> %29, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x double> %interleaved.vec145, ptr %p2, align 8
+  ret void
+}