diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -38,6 +38,7 @@
   // The following 'operations' are used to represent internal states. Backends
   // are not expected to try and support these in any capacity.
   Deinterleave,
+  Splat,
   Symmetric,
   ReductionPHI,
   ReductionOperation,
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -369,6 +369,12 @@
   /// intrinsic (for both fixed and scalable vectors)
   NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);
 
+  /// identifying the operation that represents a complex number repeated in a
+  /// Splat vector. There are two possible types of splats: ConstantExpr with
+  /// the opcode ShuffleVector and ShuffleVectorInstr. Both should have an
+  /// initialization mask with all values set to zero.
+  NodePtr identifySplat(Value *Real, Value *Imag);
+
   NodePtr identifyPHINode(Instruction *Real, Instruction *Imag);
 
   /// Identifies SelectInsts in a loop that has reduction with predication masks
@@ -863,6 +869,9 @@
     return CN;
   }
 
+  if (NodePtr CN = identifySplat(R, I))
+    return CN;
+
   auto *Real = dyn_cast<Instruction>(R);
   auto *Imag = dyn_cast<Instruction>(I);
   if (!Real || !Imag)
@@ -1690,6 +1699,59 @@
   return submitCompositeNode(PlaceholderNode);
 }
 
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
+  auto IsSplat = [](Value *V) -> bool {
+    // Fixed-width vector with constants
+    if (isa<ConstantDataVector>(V))
+      return true;
+
+    VectorType *VTy;
+    ArrayRef<int> Mask;
+    // Splats are represented differently depending on whether the repeated
+    // value is a constant or an Instruction
+    if (auto *Const = dyn_cast<ConstantExpr>(V)) {
+      if (Const->getOpcode() != Instruction::ShuffleVector)
+        return false;
+      VTy = cast<VectorType>(Const->getType());
+      Mask = Const->getShuffleMask();
+    } else if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
+      VTy = Shuf->getType();
+      Mask = Shuf->getShuffleMask();
+    } else {
+      return false;
+    }
+
+    // When the data type is <1 x Type>, it's not possible to differentiate
+    // between the ComplexDeinterleaving::Deinterleave and
+    // ComplexDeinterleaving::Splat operations.
+    if (!VTy->isScalableTy() && VTy->getElementCount().getKnownMinValue() == 1)
+      return false;
+
+    return all_equal(Mask) && Mask[0] == 0;
+  };
+
+  if (!IsSplat(R) || !IsSplat(I))
+    return nullptr;
+
+  auto *Real = dyn_cast<Instruction>(R);
+  auto *Imag = dyn_cast<Instruction>(I);
+  if ((!Real && Imag) || (Real && !Imag))
+    return nullptr;
+
+  if (Real && Imag) {
+    // Non-constant splats should be in the same basic block
+    if (Real->getParent() != Imag->getParent())
+      return nullptr;
+
+    FinalInstructions.insert(Real);
+    FinalInstructions.insert(Imag);
+  }
+  NodePtr PlaceholderNode =
+      prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I);
+  return submitCompositeNode(PlaceholderNode);
+}
+
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real,
                                             Instruction *Imag) {
@@ -1800,6 +1862,25 @@
   case ComplexDeinterleavingOperation::Deinterleave:
     llvm_unreachable("Deinterleave node should already have ReplacementNode");
     break;
+  case ComplexDeinterleavingOperation::Splat: {
+    auto *NewTy = VectorType::getDoubleElementsVectorType(
+        cast<VectorType>(Node->Real->getType()));
+    auto *R = dyn_cast<Instruction>(Node->Real);
+    auto *I = dyn_cast<Instruction>(Node->Imag);
+    if (R && I) {
+      // Splats that are not constant are interleaved where they are located
+      Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
+      IRBuilder<> IRB(InsertPoint);
+      ReplacementNode =
+          IRB.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy,
+                              {Node->Real, Node->Imag});
+    } else {
+      ReplacementNode =
+          Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
+                                  NewTy, {Node->Real, Node->Imag});
+    }
+    break;
+  }
   case ComplexDeinterleavingOperation::ReductionPHI: {
     // If Operation is ReductionPHI, a new empty PHINode is created.
     // It is filled later when the ReductionOperation is processed.
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; a[i] * b[i] * (11.0 + 3.0.i);
+;
+define <vscale x 4 x double> @complex_mul_const(<vscale x 4 x double> %a, <vscale x 4 x double> %b) {
+; CHECK-LABEL: complex_mul_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z4.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z6.d, z4.d
+; CHECK-NEXT:    fcmla z5.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    fcmla z6.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z5.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    fcmla z6.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fmov z1.d, #3.00000000
+; CHECK-NEXT:    fmov z2.d, #11.00000000
+; CHECK-NEXT:    zip2 z3.d, z2.d, z1.d
+; CHECK-NEXT:    mov z0.d, z4.d
+; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
+; CHECK-NEXT:    fcmla z4.d, p0/m, z6.d, z3.d, #0
+; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z1.d, #0
+; CHECK-NEXT:    fcmla z4.d, p0/m, z6.d, z3.d, #90
+; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z1.d, #90
+; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
+  %4 = fmul fast <vscale x 2 x double> %3, %0
+  %5 = fmul fast <vscale x 2 x double> %2, %1
+  %6 = fadd fast <vscale x 2 x double> %4, %5
+  %7 = fmul fast <vscale x 2 x double> %2, %0
+  %8 = fmul fast <vscale x 2 x double> %3, %1
+  %9 = fsub fast <vscale x 2 x double> %7, %8
+  %10 = fmul fast <vscale x 2 x double> %9, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %11 = fmul fast <vscale x 2 x double> %6, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.100000e+01, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %12 = fadd fast <vscale x 2 x double> %10, %11
+  %13 = fmul fast <vscale x 2 x double> %9, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.100000e+01, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %14 = fmul fast <vscale x 2 x double> %6, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  %15 = fsub fast <vscale x 2 x double> %13, %14
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+; a[i] * b[i] * c;
+;
+define <vscale x 4 x double> @complex_mul_non_const(<vscale x 4 x double> %a, <vscale x 4 x double> %b, [2 x double] %c) {
+; CHECK-LABEL: complex_mul_non_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z6.d, #0 // =0x0
+; CHECK-NEXT:    // kill: def $d5 killed $d5 def $z5
+; CHECK-NEXT:    // kill: def $d4 killed $d4 def $z4
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z7.d, z6.d
+; CHECK-NEXT:    mov z24.d, z6.d
+; CHECK-NEXT:    mov z5.d, d5
+; CHECK-NEXT:    mov z4.d, d4
+; CHECK-NEXT:    fcmla z7.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z7.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    zip2 z2.d, z4.d, z5.d
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    mov z0.d, z6.d
+; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    fcmla z6.d, p0/m, z24.d, z2.d, #0
+; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #0
+; CHECK-NEXT:    fcmla z6.d, p0/m, z24.d, z2.d, #90
+; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #90
+; CHECK-NEXT:    mov z1.d, z6.d
+; CHECK-NEXT:    ret
+entry:
+  %c.coerce.fca.0.extract = extractvalue [2 x double] %c, 0
+  %c.coerce.fca.1.extract = extractvalue [2 x double] %c, 1
+  %broadcast.splatinsert = insertelement <vscale x 2 x double> poison, double %c.coerce.fca.1.extract, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+  %broadcast.splatinsert49 = insertelement <vscale x 2 x double> poison, double %c.coerce.fca.0.extract, i64 0
+  %broadcast.splat50 = shufflevector <vscale x 2 x double> %broadcast.splatinsert49, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+  %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
+  %4 = fmul fast <vscale x 2 x double> %3, %0
+  %5 = fmul fast <vscale x 2 x double> %2, %1
+  %6 = fadd fast <vscale x 2 x double> %4, %5
+  %7 = fmul fast <vscale x 2 x double> %2, %0
+  %8 = fmul fast <vscale x 2 x double> %3, %1
+  %9 = fsub fast <vscale x 2 x double> %7, %8
+  %10 = fmul fast <vscale x 2 x double> %9, %broadcast.splat
+  %11 = fmul fast <vscale x 2 x double> %6, %broadcast.splat50
+  %12 = fadd fast <vscale x 2 x double> %10, %11
+  %13 = fmul fast <vscale x 2 x double> %9, %broadcast.splat50
+  %14 = fmul fast <vscale x 2 x double> %6, %broadcast.splat
+  %15 = fsub fast <vscale x 2 x double> %13, %14
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+  ret <vscale x 4 x double> %interleaved.vec
+}
+
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+
+; a[i] * b[i] * (11.0 + 3.0.i);
+;
+define <4 x double> @complex_mul_const(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: complex_mul_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v6.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v5.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v6.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    fcmla v5.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v4.2d, v2.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v0.2d, v2.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v4.2d, v2.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v0.2d, v2.2d, v5.2d, #90
+; CHECK-NEXT:    mov v1.16b, v4.16b
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec47 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec49 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec50 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec50, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec49, %strided.vec47
+  %2 = fadd fast <2 x double> %0, %1
+  %3 = fmul fast <2 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <2 x double> %strided.vec50, %strided.vec47
+  %5 = fsub fast <2 x double> %3, %4
+  %6 = fmul fast <2 x double> %5, <double 3.000000e+00, double 3.000000e+00>
+  %7 = fmul fast <2 x double> %2, <double 1.100000e+01, double 1.100000e+01>
+  %8 = fadd fast <2 x double> %6, %7
+  %9 = fmul fast <2 x double> %5, <double 1.100000e+01, double 1.100000e+01>
+  %10 = fmul fast <2 x double> %2, <double 3.000000e+00, double 3.000000e+00>
+  %11 = fsub fast <2 x double> %9, %10
+  %interleaved.vec = shufflevector <2 x double> %11, <2 x double> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+
+; a[i] * b[i] * c;
+;
+define <4 x double> @complex_mul_non_const(<4 x double> %a, <4 x double> %b, [2 x double] %c) {
+; CHECK-LABEL: complex_mul_non_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    mov v4.d[1], v5.d[0]
+; CHECK-NEXT:    fcmla v6.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v7.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v6.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v7.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v2.2d, v4.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v0.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v2.2d, v4.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v0.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %c.coerce.fca.1.extract = extractvalue [2 x double] %c, 1
+  %c.coerce.fca.0.extract = extractvalue [2 x double] %c, 0
+  %broadcast.splatinsert = insertelement <2 x double> poison, double %c.coerce.fca.1.extract, i64 0
+  %broadcast.splat = shufflevector <2 x double> %broadcast.splatinsert, <2 x double> poison, <2 x i32> zeroinitializer
+  %broadcast.splatinsert51 = insertelement <2 x double> poison, double %c.coerce.fca.0.extract, i64 0
+  %broadcast.splat52 = shufflevector <2 x double> %broadcast.splatinsert51, <2 x double> poison, <2 x i32> zeroinitializer
+  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec47 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec49 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec50 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec50, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec49, %strided.vec47
+  %2 = fadd fast <2 x double> %0, %1
+  %3 = fmul fast <2 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <2 x double> %strided.vec50, %strided.vec47
+  %5 = fsub fast <2 x double> %3, %4
+  %6 = fmul fast <2 x double> %5, %broadcast.splat
+  %7 = fmul fast <2 x double> %2, %broadcast.splat52
+  %8 = fadd fast <2 x double> %6, %7
+  %9 = fmul fast <2 x double> %5, %broadcast.splat52
+  %10 = fmul fast <2 x double> %2, %broadcast.splat
+  %11 = fsub fast <2 x double> %9, %10
+  %interleaved.vec = shufflevector <2 x double> %11, <2 x double> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}