diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -38,7 +38,7 @@
   CMulPartial,
   // The following 'operations' are used to represent internal states. Backends
   // are not expected to try and support these in any capacity.
-  Shuffle,
+  Deinterleave,
   Symmetric
 };
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -18,6 +18,11 @@
 // pairs. Validity of each node is expected to be done upon creation, and any
 // validation errors should halt traversal and prevent further graph
 // construction.
+// In addition to shufflevector instructions, vector interleaving and
+// deinterleaving can be represented by the vector.interleave2 and
+// vector.deinterleave2 intrinsics. Scalable vectors can only be expressed
+// with these intrinsics, whereas fixed-width vectors are recognized in both
+// the shufflevector and the intrinsic form.
 //
 // Replacement:
 // This step traverses the graph built up by identification, delegating to the
@@ -250,6 +255,17 @@
   NodePtr identifyNode(Instruction *I, Instruction *J);

+  NodePtr identifyRoot(Instruction *I);
+
+  /// Identifies the Deinterleave operation applied to a vector containing
+  /// complex numbers. There are two ways to represent the Deinterleave
+  /// operation:
+  /// * Using two shufflevectors with even indices for the \p Real instruction
+  ///   and odd indices for the \p Imag instruction (only for fixed-width vectors)
+  /// * Using two extractvalue instructions applied to the `vector.deinterleave2`
+  ///   intrinsic (for both fixed-width and scalable vectors)
+  NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);
+
   Value *replaceNode(RawNodePtr Node);

 public:
@@ -365,19 +381,8 @@

 bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
   ComplexDeinterleavingGraph Graph(TL, TLI);
-
-  for (auto &I : *B) {
-    auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
-    if (!SVI)
-      continue;
-
-    // Look for a shufflevector that takes separate vectors of the real and
-    // imaginary components and recombines them into a single vector.
- if (!isInterleavingMask(SVI->getShuffleMask())) - continue; - - Graph.identifyNodes(SVI); - } + for (auto &I : *B) + Graph.identifyNodes(&I); if (Graph.checkNodes()) { Graph.replaceNodes(); @@ -748,100 +753,12 @@ return CN; } - auto *RealShuffle = dyn_cast(Real); - auto *ImagShuffle = dyn_cast(Imag); - if (RealShuffle && ImagShuffle) { - Value *RealOp1 = RealShuffle->getOperand(1); - if (!isa(RealOp1) && !isa(RealOp1)) { - LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n"); - return nullptr; - } - Value *ImagOp1 = ImagShuffle->getOperand(1); - if (!isa(ImagOp1) && !isa(ImagOp1)) { - LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n"); - return nullptr; - } - - Value *RealOp0 = RealShuffle->getOperand(0); - Value *ImagOp0 = ImagShuffle->getOperand(0); - - if (RealOp0 != ImagOp0) { - LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n"); - return nullptr; - } - - ArrayRef RealMask = RealShuffle->getShuffleMask(); - ArrayRef ImagMask = ImagShuffle->getShuffleMask(); - if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) { - LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n"); - return nullptr; - } - - if (RealMask[0] != 0 || ImagMask[0] != 1) { - LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n"); - return nullptr; - } - - // Type checking, the shuffle type should be a vector type of the same - // scalar type, but half the size - auto CheckType = [&](ShuffleVectorInst *Shuffle) { - Value *Op = Shuffle->getOperand(0); - auto *ShuffleTy = cast(Shuffle->getType()); - auto *OpTy = cast(Op->getType()); - - if (OpTy->getScalarType() != ShuffleTy->getScalarType()) - return false; - if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements()) - return false; - - return true; - }; - - auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool { - if (!CheckType(Shuffle)) - return false; - - ArrayRef Mask = Shuffle->getShuffleMask(); - int Last = *Mask.rbegin(); - - Value *Op = Shuffle->getOperand(0); - auto *OpTy = cast(Op->getType()); - int NumElements = OpTy->getNumElements(); - - // Ensure that the deinterleaving shuffle only pulls from the first - // shuffle operand. 
- return Last < NumElements; - }; - - if (RealShuffle->getType() != ImagShuffle->getType()) { - LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n"); - return nullptr; - } - if (!CheckDeinterleavingShuffle(RealShuffle)) { - LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n"); - return nullptr; - } - if (!CheckDeinterleavingShuffle(ImagShuffle)) { - LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n"); - return nullptr; - } - - NodePtr PlaceholderNode = - prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Shuffle, - RealShuffle, ImagShuffle); - PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0); - FinalInstructions.insert(RealShuffle); - FinalInstructions.insert(ImagShuffle); - return submitCompositeNode(PlaceholderNode); - } - if (RealShuffle || ImagShuffle) { - LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n"); - return nullptr; - } + NodePtr Node = identifyDeinterleave(Real, Imag); + if (Node) + return Node; - auto *VTy = cast(Real->getType()); - auto *NewVTy = - FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + auto *VTy = cast(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); if (TL->isComplexDeinterleavingOperationSupported( ComplexDeinterleavingOperation::CMulPartial, NewVTy) && @@ -862,13 +779,10 @@ } bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { - Instruction *Real; - Instruction *Imag; - if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag)))) + auto RootNode = identifyRoot(RootI); + if (!RootNode) return false; - auto RootNode = identifyNode(Real, Imag); - LLVM_DEBUG({ Function *F = RootI->getFunction(); BasicBlock *B = RootI->getParent(); @@ -877,14 +791,9 @@ dump(dbgs()); dbgs() << "\n"; }); - - if (RootNode) { - RootToNode[RootI] = RootNode; - OrderedRoots.push_back(RootI); - return true; - } - - return false; + RootToNode[RootI] = RootNode; + OrderedRoots.push_back(RootI); + return true; } bool ComplexDeinterleavingGraph::checkNodes() { @@ -960,6 +869,147 @@ return !RootToNode.empty(); } +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) { + if (auto *Intrinsic = dyn_cast(RootI)) { + if (Intrinsic->getIntrinsicID() != + Intrinsic::experimental_vector_interleave2) + return nullptr; + + auto *Real = dyn_cast(Intrinsic->getOperand(0)); + auto *Imag = dyn_cast(Intrinsic->getOperand(1)); + if (!Real || !Imag) + return nullptr; + + return identifyNode(Real, Imag); + } + + auto *SVI = dyn_cast(RootI); + if (!SVI) + return nullptr; + + // Look for a shufflevector that takes separate vectors of the real and + // imaginary components and recombines them into a single vector. 
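As a rough illustration of the two root forms identifyRoot accepts (an editorial sketch, not part of the patch; the value names %real, %imag, %root and the element types are hypothetical):

  ; Fixed-width root: a shufflevector whose mask interleaves the real and
  ; imaginary vectors back into a single vector.
  %root = shufflevector <4 x float> %real, <4 x float> %imag, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>

  ; Intrinsic root (the only form available for scalable vectors): the same
  ; recombination expressed with vector.interleave2.
  %root.sc = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %real.sc, <vscale x 4 x float> %imag.sc)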
+ if (!isInterleavingMask(SVI->getShuffleMask())) + return nullptr; + + Instruction *Real; + Instruction *Imag; + if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag)))) + return nullptr; + + return identifyNode(Real, Imag); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, + Instruction *Imag) { + Instruction *I = nullptr; + Value *FinalValue = nullptr; + if (match(Real, m_ExtractValue<0>(m_Instruction(I))) && + match(Imag, m_ExtractValue<1>(m_Specific(I))) && + match(I, m_Intrinsic( + m_Value(FinalValue)))) { + NodePtr PlaceholderNode = prepareCompositeNode( + llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag); + PlaceholderNode->ReplacementNode = FinalValue; + FinalInstructions.insert(Real); + FinalInstructions.insert(Imag); + return submitCompositeNode(PlaceholderNode); + } + + auto *RealShuffle = dyn_cast(Real); + auto *ImagShuffle = dyn_cast(Imag); + if (!RealShuffle || !ImagShuffle) { + if (RealShuffle || ImagShuffle) + LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n"); + return nullptr; + } + + Value *RealOp1 = RealShuffle->getOperand(1); + if (!isa(RealOp1) && !isa(RealOp1)) { + LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n"); + return nullptr; + } + Value *ImagOp1 = ImagShuffle->getOperand(1); + if (!isa(ImagOp1) && !isa(ImagOp1)) { + LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n"); + return nullptr; + } + + Value *RealOp0 = RealShuffle->getOperand(0); + Value *ImagOp0 = ImagShuffle->getOperand(0); + + if (RealOp0 != ImagOp0) { + LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n"); + return nullptr; + } + + ArrayRef RealMask = RealShuffle->getShuffleMask(); + ArrayRef ImagMask = ImagShuffle->getShuffleMask(); + if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) { + LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n"); + return nullptr; + } + + if (RealMask[0] != 0 || ImagMask[0] != 1) { + LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n"); + return nullptr; + } + + // Type checking, the shuffle type should be a vector type of the same + // scalar type, but half the size + auto CheckType = [&](ShuffleVectorInst *Shuffle) { + Value *Op = Shuffle->getOperand(0); + auto *ShuffleTy = cast(Shuffle->getType()); + auto *OpTy = cast(Op->getType()); + + if (OpTy->getScalarType() != ShuffleTy->getScalarType()) + return false; + if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements()) + return false; + + return true; + }; + + auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool { + if (!CheckType(Shuffle)) + return false; + + ArrayRef Mask = Shuffle->getShuffleMask(); + int Last = *Mask.rbegin(); + + Value *Op = Shuffle->getOperand(0); + auto *OpTy = cast(Op->getType()); + int NumElements = OpTy->getNumElements(); + + // Ensure that the deinterleaving shuffle only pulls from the first + // shuffle operand. 
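As a matching sketch for identifyDeinterleave (again with hypothetical value names and types), the two deinterleave forms it recognizes look like this in IR:

  ; Fixed-width only: two shufflevectors over the same operand, one taking the
  ; even (real) lanes and one taking the odd (imaginary) lanes.
  %real = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %imag = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  ; Fixed-width or scalable: extractvalue 0/1 applied to vector.deinterleave2.
  %deint   = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec.sc)
  %real.sc = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deint, 0
  %imag.sc = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deint, 1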
+    return Last < NumElements;
+  };
+
+  if (RealShuffle->getType() != ImagShuffle->getType()) {
+    LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n");
+    return nullptr;
+  }
+  if (!CheckDeinterleavingShuffle(RealShuffle)) {
+    LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n");
+    return nullptr;
+  }
+  if (!CheckDeinterleavingShuffle(ImagShuffle)) {
+    LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n");
+    return nullptr;
+  }
+
+  NodePtr PlaceholderNode =
+      prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Deinterleave,
+                           RealShuffle, ImagShuffle);
+  PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0);
+  FinalInstructions.insert(RealShuffle);
+  FinalInstructions.insert(ImagShuffle);
+  return submitCompositeNode(PlaceholderNode);
+}
+
 static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node,
                                    Value *InputA, Value *InputB) {
   Instruction *I = Node->Real;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24625,20 +24625,30 @@
 }

 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
-  return Subtarget->hasComplxNum();
+  return Subtarget->hasSVE() || Subtarget->hasComplxNum();
 }

 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
     ComplexDeinterleavingOperation Operation, Type *Ty) const {
-  auto *VTy = dyn_cast<FixedVectorType>(Ty);
+  auto *VTy = dyn_cast<VectorType>(Ty);
   if (!VTy)
     return false;

+  // If the vector is scalable, SVE is enabled, implying support for complex
+  // numbers. Otherwise, we need to ensure complex number support is available.
+  if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
+    return false;
+
   auto *ScalarTy = VTy->getScalarType();
-  unsigned NumElements = VTy->getNumElements();
+  unsigned NumElements = VTy->getElementCount().getKnownMinValue();
+
+  // We can only process vectors that have a bit size of 128 or higher (with an
+  // additional 64 bits for Neon). Additionally, these vectors must have a
+  // power-of-2 size, as we later split them into the smallest supported size
+  // and merge them back together after applying the complex operation.
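A sketch of the split-and-merge this comment refers to, assuming a <vscale x 8 x float> operand and hypothetical value names: createComplexDeinterleavingIR halves any operand wider than the supported width with vector.extract, lowers each half independently, and rejoins the two lowered halves with vector.insert.

  %lo = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %in, i64 0)
  %hi = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %in, i64 4)
  ; ...each half is lowered to its own fcadd/fcmla sequence, producing %lo.res and %hi.res...
  %join = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> %lo.res, i64 0)
  %out  = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> %join, <vscale x 4 x float> %hi.res, i64 4)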
unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; - if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth)) + if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) || + !llvm::isPowerOf2_32(VTyWidth)) return false; return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || @@ -24649,57 +24659,75 @@ Instruction *I, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { - FixedVectorType *Ty = cast(InputA->getType()); + VectorType *Ty = cast(InputA->getType()); + bool IsScalable = Ty->isScalableTy(); IRBuilder<> B(I); - unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); + unsigned TyWidth = + Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue(); assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) && "Vector type must be either 64 or a power of 2 that is at least 128"); if (TyWidth > 128) { - int Stride = Ty->getNumElements() / 2; - auto SplitSeq = llvm::seq(0, Ty->getNumElements()); - auto SplitSeqVec = llvm::to_vector(SplitSeq); - ArrayRef LowerSplitMask(&SplitSeqVec[0], Stride); - ArrayRef UpperSplitMask(&SplitSeqVec[Stride], Stride); - - auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask); - auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask); - auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask); - auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask); + int Stride = Ty->getElementCount().getKnownMinValue() / 2; + auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); + auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); + auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); + auto *UpperSplitA = + B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride)); + auto *UpperSplitB = + B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride)); Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; - if (Accumulator) { - LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask); - UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask); + LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0)); + UpperSplitAcc = + B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); } - auto *LowerSplitInt = createComplexDeinterleavingIR( I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - ArrayRef JoinMask(&SplitSeqVec[0], Ty->getNumElements()); - return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); + auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, + B.getInt64(0)); + return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + if (Accumulator == nullptr) + Accumulator = ConstantFP::get(Ty, 0); + + if (IsScalable) { + auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true)); + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_fcmla, Ty, + {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + } + Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0, Intrinsic::aarch64_neon_vcmla_rot90, Intrinsic::aarch64_neon_vcmla_rot180, Intrinsic::aarch64_neon_vcmla_rot270}; - if (Accumulator == nullptr) - Accumulator = ConstantFP::get(Ty, 0); return 
B.CreateIntrinsic(IdMap[(int)Rotation], Ty, {Accumulator, InputB, InputA}); } if (OperationType == ComplexDeinterleavingOperation::CAdd) { + if (IsScalable) { + auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true)); + if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || + Rotation == ComplexDeinterleavingRotation::Rotation_270) + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_fcadd, Ty, + {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)}); + return nullptr; + } + Intrinsic::ID IntId = Intrinsic::not_intrinsic; if (Rotation == ComplexDeinterleavingRotation::Rotation_90) IntId = Intrinsic::aarch64_neon_vcadd_rot90; diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to not transform +define @complex_add_v4f16( %a, %b) { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpkhi z3.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d +; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z4.h +; CHECK-NEXT: zip2 z2.d, z0.d, z1.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f16( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v8f16( %a, %b) { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fcadd z1.h, p0/m, z1.h, z0.h, #90 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8f16( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v16f16( %a, %b) { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fcadd z2.h, p0/m, z2.h, z0.h, #90 +; CHECK-NEXT: fcadd z3.h, p0/m, z3.h, z1.h, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: ret +entry: + 
%a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16f16( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v32f16( %a, %b) { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fcadd z6.h, p0/m, z6.h, z2.h, #90 +; CHECK-NEXT: fcadd z4.h, p0/m, z4.h, z0.h, #90 +; CHECK-NEXT: fcadd z5.h, p0/m, z5.h, z1.h, #90 +; CHECK-NEXT: fcadd z7.h, p0/m, z7.h, z3.h, #90 +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z2.d, z6.d +; CHECK-NEXT: mov z3.d, z7.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv32f16( %0, %1) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f16() +declare @llvm.experimental.vector.interleave2.nxv4f16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f16() +declare @llvm.experimental.vector.interleave2.nxv8f16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16f16() +declare @llvm.experimental.vector.interleave2.nxv16f16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv32f16() +declare @llvm.experimental.vector.interleave2.nxv32f16(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s target triple = "aarch64-arm-none-eabi" @@ -98,3 +99,70 @@ %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> ret <32 x half> %interleaved.vec } + +; Expected to transform +define <4 x half> @complex_add_v4f16_with_intrinsic(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_add_v4f16_with_intrinsic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90 +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %a) + %a.real = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 0 + %a.imag = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 1 + %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %b) + %b.real = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 0 + %b.imag = 
extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 1 + %0 = fsub fast <2 x half> %b.real, %a.imag + %1 = fadd fast <2 x half> %b.imag, %a.real + %interleaved.vec = tail call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1) + ret <4 x half> %interleaved.vec +} + +; Expected to transform +define <8 x half> @complex_add_v8f16_with_intrinsic(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_add_v8f16_with_intrinsic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90 +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %a) + %a.real = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 0 + %a.imag = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 1 + %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %b) + %b.real = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 0 + %b.imag = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 1 + %0 = fsub fast <4 x half> %b.real, %a.imag + %1 = fadd fast <4 x half> %b.imag, %a.real + %interleaved.vec = tail call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1) + ret <8 x half> %interleaved.vec +} + +; Expected to transform +define <16 x half> @complex_add_v16f16_with_intrinsic(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_add_v16f16_with_intrinsic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.8h, v2.8h, v0.8h, #90 +; CHECK-NEXT: fcadd v1.8h, v3.8h, v1.8h, #90 +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %a) + %a.real = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 0 + %a.imag = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 1 + %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %b) + %b.real = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 0 + %b.imag = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 1 + %0 = fsub fast <8 x half> %b.real, %a.imag + %1 = fadd fast <8 x half> %b.imag, %a.real + %interleaved.vec = tail call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1) + ret <16 x half> %interleaved.vec +} + +declare { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) + +declare { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) + +declare { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_mul_v4f16( %a, %b) { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpkhi z3.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z4.d, z0.d, z2.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d +; CHECK-NEXT: movprfx z3, z2 +; CHECK-NEXT: fmul z3.h, p0/m, z3.h, z0.h +; CHECK-NEXT: fmla z3.h, p0/m, z1.h, z4.h +; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z4.h +; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: zip2 z1.d, z0.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z3.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f16( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v8f16( %a, %b) { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #0 +; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8f16( %5, %2) + ret %interleaved.vec +} +; Expected to transform +define @complex_mul_v16f16( %a, %b) { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z4.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0 +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 +; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #90 +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90 +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16f16( %5, %2) + ret 
%interleaved.vec +} + +; Expected to transform +define @complex_mul_v32f16( %a, %b) { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #0 +; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #0 +; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #0 +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0 +; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #90 +; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #90 +; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #90 +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90 +; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: mov z1.d, z26.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: mov z3.d, z24.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32f16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32f16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv32f16( %5, %2) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f16() +declare @llvm.experimental.vector.interleave2.nxv4f16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f16() +declare @llvm.experimental.vector.interleave2.nxv8f16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16f16() +declare @llvm.experimental.vector.interleave2.nxv16f16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv32f16() +declare @llvm.experimental.vector.interleave2.nxv32f16(, ) + + diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_add_v4f32( %a, %b) { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcadd z1.s, p0/m, z1.s, z0.s, #90 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f32( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v8f32( %a, %b) { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcadd z2.s, p0/m, z2.s, z0.s, #90 +; CHECK-NEXT: 
fcadd z3.s, p0/m, z3.s, z1.s, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8f32( %0, %1) + ret %interleaved.vec +} +; Expected to transform +define @complex_add_v16f32( %a, %b) { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcadd z6.s, p0/m, z6.s, z2.s, #90 +; CHECK-NEXT: fcadd z4.s, p0/m, z4.s, z0.s, #90 +; CHECK-NEXT: fcadd z5.s, p0/m, z5.s, z1.s, #90 +; CHECK-NEXT: fcadd z7.s, p0/m, z7.s, z3.s, #90 +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z2.d, z6.d +; CHECK-NEXT: mov z3.d, z7.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16f32( %0, %1) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f32() +declare @llvm.experimental.vector.interleave2.nxv4f32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f32() +declare @llvm.experimental.vector.interleave2.nxv8f32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16f32() +declare @llvm.experimental.vector.interleave2.nxv16f32(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_mul_v4f32( %a, %b) { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #0 +; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call 
@llvm.experimental.vector.interleave2.nxv4f32( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v8f32( %a, %b) { +; CHECK-LABEL: complex_mul_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z4.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0 +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 +; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #90 +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90 +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8f32( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v16f32( %a, %b) { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #0 +; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #0 +; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #0 +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0 +; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #90 +; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #90 +; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #90 +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90 +; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: mov z1.d, z26.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: mov z3.d, z24.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16f32( %5, %2) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f32() +declare @llvm.experimental.vector.interleave2.nxv4f32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f32() +declare @llvm.experimental.vector.interleave2.nxv8f32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16f32() +declare @llvm.experimental.vector.interleave2.nxv16f32(, ) + diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s 
--mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_add_v2f64( %a, %b) { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcadd z1.d, p0/m, z1.d, z0.d, #90 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2f64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2f64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv2f64( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v4f64( %a, %b) { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcadd z2.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: fcadd z3.d, p0/m, z3.d, z1.d, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v8f64( %a, %b) { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcadd z6.d, p0/m, z6.d, z2.d, #90 +; CHECK-NEXT: fcadd z4.d, p0/m, z4.d, z0.d, #90 +; CHECK-NEXT: fcadd z5.d, p0/m, z5.d, z1.d, #90 +; CHECK-NEXT: fcadd z7.d, p0/m, z7.d, z3.d, #90 +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z2.d, z6.d +; CHECK-NEXT: mov z3.d, z7.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fsub fast %b.real, %a.imag + %1 = fadd fast %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8f64( %0, %1) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv2f64() +declare @llvm.experimental.vector.interleave2.nxv2f64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f64() +declare @llvm.experimental.vector.interleave2.nxv4f64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f64() +declare @llvm.experimental.vector.interleave2.nxv8f64(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_mul_v2f64( %a, %b) { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #0 +; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2f64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2f64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv2f64( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v4f64( %a, %b) { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #0 +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 +; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #90 +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v8f64( %a, %b) { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #0 +; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #0 +; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #90 +; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #90 +; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #90 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90 +; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: mov z1.d, z26.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: mov z3.d, z24.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , 
} %b.deinterleaved, 1 + %0 = fmul fast %b.imag, %a.real + %1 = fmul fast %b.real, %a.imag + %2 = fadd fast %1, %0 + %3 = fmul fast %b.real, %a.real + %4 = fmul fast %a.imag, %b.imag + %5 = fsub fast %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8f64( %5, %2) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv2f64() +declare @llvm.experimental.vector.interleave2.nxv2f64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f64() +declare @llvm.experimental.vector.interleave2.nxv4f64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f64() +declare @llvm.experimental.vector.interleave2.nxv8f64(, )
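For reference, the fcmla pairs in the scalable-vector CHECK lines above come from the SVE branch of createComplexDeinterleavingIR, which emits a predicated aarch64.sve.fcmla with an all-true mask, a zero accumulator when none is supplied, and the rotation encoded as a multiple of 90. A rough IR sketch for one partial complex multiply (value names hypothetical, not taken from the patch):

  ; Splat of i1 true acts as the all-true predicate.
  %ins  = insertelement <vscale x 4 x i1> poison, i1 true, i64 0
  %mask = shufflevector <vscale x 4 x i1> %ins, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  ; Rotation 0 then rotation 90, chained through the accumulator, form the
  ; full complex multiply.
  %p0  = call <vscale x 4 x float> @llvm.aarch64.sve.fcmla.nxv4f32(<vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
  %mul = call <vscale x 4 x float> @llvm.aarch64.sve.fcmla.nxv4f32(<vscale x 4 x i1> %mask, <vscale x 4 x float> %p0, <vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 90)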