diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -100,6 +100,13 @@ /// <1, 3, 5, 7>). static bool isDeinterleavingMask(ArrayRef<int> Mask); +/// Returns true if \p V is itself a negation of some value; both integer +/// and floating-point negations are recognised. +static bool isNeg(Value *V); + +/// Returns the value being negated by \p V; \p V must satisfy isNeg(). +static Value *getNegOperand(Value *V); + namespace { class ComplexDeinterleavingLegacyPass : public FunctionPass { @@ -146,7 +153,7 @@ // This two members are required exclusively for generating // ComplexDeinterleavingOperation::Symmetric operations. unsigned Opcode; - FastMathFlags Flags; + std::optional<FastMathFlags> Flags; ComplexDeinterleavingRotation Rotation = ComplexDeinterleavingRotation::Rotation_0; @@ -333,7 +340,8 @@ /// Return nullptr if it is not possible to construct a complex number. /// \p Flags are needed to generate symmetric Add and Sub operations. NodePtr identifyAdditions(std::list<Addend> &RealAddends, - std::list<Addend> &ImagAddends, FastMathFlags Flags, + std::list<Addend> &ImagAddends, + std::optional<FastMathFlags> Flags, NodePtr Accumulator); /// Extract one addend that have both real and imaginary parts positive. 
@@ -512,6 +520,19 @@ return true; } +bool isNeg(Value *V) { + return match(V, m_FNeg(m_Value())) || match(V, m_Neg(m_Value())); +} + +Value *getNegOperand(Value *V) { + assert(isNeg(V)); + auto *I = cast<Instruction>(V); + if (I->getOpcode() == Instruction::FNeg) + return I->getOperand(0); + + return I->getOperand(1); +} + bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { ComplexDeinterleavingGraph Graph(TL, TLI); if (Graph.collectPotentialReductions(B)) @@ -540,9 +561,12 @@ return nullptr; } - if (Real->getOpcode() != Instruction::FMul || - Imag->getOpcode() != Instruction::FMul) { - LLVM_DEBUG(dbgs() << " - Real or imaginary instruction is not fmul\n"); + if ((Real->getOpcode() != Instruction::FMul && + Real->getOpcode() != Instruction::Mul) || + (Imag->getOpcode() != Instruction::FMul && + Imag->getOpcode() != Instruction::Mul)) { + LLVM_DEBUG( + dbgs() << " - Real or imaginary instruction is not fmul or mul\n"); return nullptr; } @@ -563,7 +587,7 @@ R1 = Op; } - if (match(I0, m_Neg(m_Value(Op)))) { + if (isNeg(I0)) { Negs |= 2; Negs ^= 1; - I0 = Op; + I0 = getNegOperand(I0); @@ -634,26 +658,29 @@ LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag << "\n"); // Determine rotation + auto IsAdd = [](unsigned Op) { + return Op == Instruction::FAdd || Op == Instruction::Add; + }; + auto IsSub = [](unsigned Op) { + return Op == Instruction::FSub || Op == Instruction::Sub; + }; ComplexDeinterleavingRotation Rotation; - if (Real->getOpcode() == Instruction::FAdd && - Imag->getOpcode() == Instruction::FAdd) + if (IsAdd(Real->getOpcode()) && IsAdd(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_0; - else if (Real->getOpcode() == Instruction::FSub && - Imag->getOpcode() == Instruction::FAdd) + else if (IsSub(Real->getOpcode()) && IsAdd(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_90; - else if (Real->getOpcode() == Instruction::FSub && - Imag->getOpcode() == Instruction::FSub) + else if (IsSub(Real->getOpcode()) && 
IsSub(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_180; - else if (Real->getOpcode() == Instruction::FAdd && - Imag->getOpcode() == Instruction::FSub) + else if (IsAdd(Real->getOpcode()) && IsSub(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_270; else { LLVM_DEBUG(dbgs() << " - Unhandled rotation.\n"); return nullptr; } - if (!Real->getFastMathFlags().allowContract() || - !Imag->getFastMathFlags().allowContract()) { + if (isa<FPMathOperator>(Real) && + (!Real->getFastMathFlags().allowContract() || + !Imag->getFastMathFlags().allowContract())) { LLVM_DEBUG(dbgs() << " - Contract is missing from the FastMath flags.\n"); return nullptr; } @@ -816,6 +843,9 @@ case Instruction::FSub: case Instruction::FMul: case Instruction::FNeg: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: return true; default: return false; @@ -925,27 +955,31 @@ ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, Instruction *Imag) { + auto IsOperationSupported = [](unsigned Opcode) -> bool { + return Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FNeg || Opcode == Instruction::Add || + Opcode == Instruction::Sub; + }; - if ((Real->getOpcode() != Instruction::FAdd && - Real->getOpcode() != Instruction::FSub && - Real->getOpcode() != Instruction::FNeg) || - (Imag->getOpcode() != Instruction::FAdd && - Imag->getOpcode() != Instruction::FSub && - Imag->getOpcode() != Instruction::FNeg)) + if (!IsOperationSupported(Real->getOpcode()) || + !IsOperationSupported(Imag->getOpcode())) return nullptr; - if (Real->getFastMathFlags() != Imag->getFastMathFlags()) { - LLVM_DEBUG( - dbgs() - << "The flags in Real and Imaginary instructions are not identical\n"); - return nullptr; - } + std::optional<FastMathFlags> Flags; + if (isa<FPMathOperator>(Real)) { + if (Real->getFastMathFlags() != Imag->getFastMathFlags()) { + LLVM_DEBUG(dbgs() << "The flags in Real and Imaginary instructions are " + 
"not identical\n"); + return nullptr; + } - FastMathFlags Flags = Real->getFastMathFlags(); - if (!Flags.allowReassoc()) { - LLVM_DEBUG( - dbgs() << "the 'Reassoc' attribute is missing in the FastMath flags\n"); - return nullptr; + Flags = Real->getFastMathFlags(); + if (!Flags->allowReassoc()) { + LLVM_DEBUG( + dbgs() + << "the 'Reassoc' attribute is missing in the FastMath flags\n"); + return nullptr; + } } // Collect multiplications and addend instructions from the given instruction @@ -978,35 +1012,52 @@ Addends.emplace_back(I, IsPositive); continue; } - - if (I->getOpcode() == Instruction::FAdd) { + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::Add: Worklist.emplace_back(I->getOperand(1), IsPositive); Worklist.emplace_back(I->getOperand(0), IsPositive); - } else if (I->getOpcode() == Instruction::FSub) { + break; + case Instruction::FSub: Worklist.emplace_back(I->getOperand(1), !IsPositive); Worklist.emplace_back(I->getOperand(0), IsPositive); - } else if (I->getOpcode() == Instruction::FMul) { + break; + case Instruction::Sub: + if (isNeg(I)) { + Worklist.emplace_back(getNegOperand(I), !IsPositive); + } else { + Worklist.emplace_back(I->getOperand(1), !IsPositive); + Worklist.emplace_back(I->getOperand(0), IsPositive); + } + break; + case Instruction::FMul: + case Instruction::Mul: { Value *A, *B; - if (match(I->getOperand(0), m_FNeg(m_Value(A)))) { + if (isNeg(I->getOperand(0))) { + A = getNegOperand(I->getOperand(0)); IsPositive = !IsPositive; } else { A = I->getOperand(0); } - if (match(I->getOperand(1), m_FNeg(m_Value(B)))) { + if (isNeg(I->getOperand(1))) { + B = getNegOperand(I->getOperand(1)); IsPositive = !IsPositive; } else { B = I->getOperand(1); } Muls.push_back(Product{A, B, IsPositive}); - } else if (I->getOpcode() == Instruction::FNeg) { + break; + } + case Instruction::FNeg: Worklist.emplace_back(I->getOperand(0), !IsPositive); - } else { + break; + default: Addends.emplace_back(I, IsPositive); continue; } - if 
(I->getFastMathFlags() != Flags) { + if (Flags && I->getFastMathFlags() != *Flags) { LLVM_DEBUG(dbgs() << "The instruction's fast math flags are " "inconsistent with the root instructions' flags: " << *I << "\n"); @@ -1258,10 +1309,9 @@ } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifyAdditions(std::list<Addend> &RealAddends, - std::list<Addend> &ImagAddends, - FastMathFlags Flags, - NodePtr Accumulator = nullptr) { +ComplexDeinterleavingGraph::identifyAdditions( + std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends, + std::optional<FastMathFlags> Flags, NodePtr Accumulator = nullptr) { if (RealAddends.size() != ImagAddends.size()) return nullptr; @@ -1312,14 +1362,22 @@ if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0) { TmpNode = prepareCompositeNode( ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); - TmpNode->Opcode = Instruction::FAdd; - TmpNode->Flags = Flags; + if (Flags) { + TmpNode->Opcode = Instruction::FAdd; + TmpNode->Flags = *Flags; + } else { + TmpNode->Opcode = Instruction::Add; + } } else if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_180) { TmpNode = prepareCompositeNode( ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); - TmpNode->Opcode = Instruction::FSub; - TmpNode->Flags = Flags; + if (Flags) { + TmpNode->Opcode = Instruction::FSub; + TmpNode->Flags = *Flags; + } else { + TmpNode->Opcode = Instruction::Sub; + } } else { TmpNode = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, nullptr, nullptr); @@ -1815,8 +1873,8 @@ } static Value *replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode, - FastMathFlags Flags, Value *InputA, - Value *InputB) { + std::optional<FastMathFlags> Flags, + Value *InputA, Value *InputB) { Value *I; switch (Opcode) { case Instruction::FNeg: @@ -1825,16 +1883,26 @@ case Instruction::FAdd: I = B.CreateFAdd(InputA, InputB); break; + case Instruction::Add: + I = B.CreateAdd(InputA, InputB); + break; case Instruction::FSub: I = B.CreateFSub(InputA, InputB); break; + case 
Instruction::Sub: + I = B.CreateSub(InputA, InputB); + break; case Instruction::FMul: I = B.CreateFMul(InputA, InputB); break; + case Instruction::Mul: + I = B.CreateMul(InputA, InputB); + break; default: llvm_unreachable("Incorrect symmetric opcode"); } - cast<Instruction>(I)->setFastMathFlags(Flags); + if (Flags) + cast<Instruction>(I)->setFastMathFlags(*Flags); return I; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25858,7 +25858,8 @@ } bool AArch64TargetLowering::isComplexDeinterleavingSupported() const { - return Subtarget->hasSVE() || Subtarget->hasComplxNum(); + return Subtarget->hasSVE() || Subtarget->hasSVE2() || + Subtarget->hasComplxNum(); } bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( @@ -25884,6 +25885,11 @@ !llvm::isPowerOf2_32(VTyWidth)) return false; + if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2()) { + unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); + return 8 <= ScalarWidth && ScalarWidth <= 64; + } + return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); } @@ -25894,6 +25900,7 @@ Value *Accumulator) const { VectorType *Ty = cast<VectorType>(InputA->getType()); bool IsScalable = Ty->isScalableTy(); + bool IsInt = Ty->getElementType()->isIntegerTy(); unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue(); @@ -25929,10 +25936,15 @@ if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { if (Accumulator == nullptr) - Accumulator = ConstantFP::get(Ty, 0); + Accumulator = Constant::getNullValue(Ty); if (IsScalable) { - auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true)); + if (IsInt) + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_cmla_x, Ty, + {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + + auto *Mask = 
B.getAllOnesMask(Ty->getElementCount()); return B.CreateIntrinsic( Intrinsic::aarch64_sve_fcmla, Ty, {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); @@ -25950,12 +25962,18 @@ if (OperationType == ComplexDeinterleavingOperation::CAdd) { if (IsScalable) { - auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true)); if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || - Rotation == ComplexDeinterleavingRotation::Rotation_270) + Rotation == ComplexDeinterleavingRotation::Rotation_270) { + if (IsInt) + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_cadd_x, Ty, + {InputA, InputB, B.getInt32((int)Rotation * 90)}); + + auto *Mask = B.getAllOnesMask(Ty->getElementCount()); return B.CreateIntrinsic( Intrinsic::aarch64_sve_fcadd, Ty, {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)}); + } return nullptr; } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to not transform as the type's minimum size is less than 128 bits. 
+define @complex_add_v4i16( %a, %b) { +; CHECK-LABEL: complex_add_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpkhi z3.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d +; CHECK-NEXT: sub z0.d, z1.d, z0.d +; CHECK-NEXT: add z1.d, z2.d, z4.d +; CHECK-NEXT: zip2 z2.d, z0.d, z1.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4i16( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v8i16( %a, %b) { +; CHECK-LABEL: complex_add_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z1.h, z1.h, z0.h, #90 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i16( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v16i16( %a, %b) { +; 
CHECK-LABEL: complex_add_v16i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z2.h, z2.h, z0.h, #90 +; CHECK-NEXT: cadd z3.h, z3.h, z1.h, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16i16( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v32i16( %a, %b) { +; CHECK-LABEL: complex_add_v32i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z6.h, z6.h, z2.h, #90 +; CHECK-NEXT: cadd z4.h, z4.h, z0.h, #90 +; CHECK-NEXT: cadd z5.h, z5.h, z1.h, #90 +; CHECK-NEXT: cadd z7.h, z7.h, z3.h, #90 +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z2.d, z6.d +; CHECK-NEXT: mov z3.d, z7.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv32i16( %0, %1) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4i16() +declare @llvm.experimental.vector.interleave2.nxv4i16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i16() +declare 
@llvm.experimental.vector.interleave2.nxv8i16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16i16() +declare @llvm.experimental.vector.interleave2.nxv16i16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv32i16() +declare @llvm.experimental.vector.interleave2.nxv32i16(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to not transform as the type's minimum size is less than 128 bits. +define @complex_mul_v4i16( %a, %b) { +; CHECK-LABEL: complex_mul_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpkhi z3.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z3.d, z1.d, z0.d +; CHECK-NEXT: mul z1.d, z1.d, z4.d +; CHECK-NEXT: mla z3.d, p0/m, z2.d, z4.d +; CHECK-NEXT: msb z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: zip2 z1.d, z0.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z3.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add 
%1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4i16( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v8i16( %a, %b) { +; CHECK-LABEL: complex_mul_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: cmla z2.h, z1.h, z0.h, #0 +; CHECK-NEXT: cmla z2.h, z1.h, z0.h, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i16( %5, %2) + ret %interleaved.vec +} +; Expected to transform +define @complex_mul_v16i16( %a, %b) { +; CHECK-LABEL: complex_mul_v16i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z4.h, #0 // =0x0 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: cmla z4.h, z3.h, z1.h, #0 +; CHECK-NEXT: cmla z5.h, z2.h, z0.h, #0 +; CHECK-NEXT: cmla z4.h, z3.h, z1.h, #90 +; CHECK-NEXT: cmla z5.h, z2.h, z0.h, #90 +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, 
%a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16i16( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v32i16( %a, %b) { +; CHECK-LABEL: complex_mul_v32i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.h, #0 // =0x0 +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: cmla z25.h, z4.h, z0.h, #0 +; CHECK-NEXT: cmla z26.h, z5.h, z1.h, #0 +; CHECK-NEXT: cmla z27.h, z6.h, z2.h, #0 +; CHECK-NEXT: cmla z24.h, z7.h, z3.h, #0 +; CHECK-NEXT: cmla z25.h, z4.h, z0.h, #90 +; CHECK-NEXT: cmla z26.h, z5.h, z1.h, #90 +; CHECK-NEXT: cmla z27.h, z6.h, z2.h, #90 +; CHECK-NEXT: cmla z24.h, z7.h, z3.h, #90 +; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: mov z1.d, z26.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: mov z3.d, z24.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i16( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i16( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv32i16( %5, %2) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4i16() +declare @llvm.experimental.vector.interleave2.nxv4i16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i16() +declare @llvm.experimental.vector.interleave2.nxv8i16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16i16() +declare 
@llvm.experimental.vector.interleave2.nxv16i16(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv32i16() +declare @llvm.experimental.vector.interleave2.nxv32i16(, ) + + diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_add_v4i32( %a, %b) { +; CHECK-LABEL: complex_add_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z1.s, z1.s, z0.s, #90 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4i32( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v8i32( %a, %b) { +; CHECK-LABEL: complex_add_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z2.s, z2.s, z0.s, #90 +; CHECK-NEXT: cadd z3.s, z3.s, z1.s, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( %b) 
+ %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i32( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v16i32( %a, %b) { +; CHECK-LABEL: complex_add_v16i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z6.s, z6.s, z2.s, #90 +; CHECK-NEXT: cadd z4.s, z4.s, z0.s, #90 +; CHECK-NEXT: cadd z5.s, z5.s, z1.s, #90 +; CHECK-NEXT: cadd z7.s, z7.s, z3.s, #90 +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z2.d, z6.d +; CHECK-NEXT: mov z3.d, z7.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16i32( %0, %1) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4i32() +declare @llvm.experimental.vector.interleave2.nxv4i32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i32() +declare @llvm.experimental.vector.interleave2.nxv8i32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16i32() +declare @llvm.experimental.vector.interleave2.nxv16i32(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_mul_v4i32( %a, %b) { +; CHECK-LABEL: complex_mul_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: cmla z2.s, z1.s, z0.s, #0 +; CHECK-NEXT: cmla z2.s, z1.s, z0.s, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4i32( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v8i32( %a, %b) { +; CHECK-LABEL: complex_mul_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z4.s, #0 // =0x0 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: cmla z4.s, z3.s, z1.s, #0 +; CHECK-NEXT: cmla z5.s, z2.s, z0.s, #0 +; CHECK-NEXT: cmla z4.s, z3.s, z1.s, #90 +; CHECK-NEXT: cmla z5.s, z2.s, z0.s, #90 +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul 
%b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i32( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v16i32( %a, %b) { +; CHECK-LABEL: complex_mul_v16i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.s, #0 // =0x0 +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: cmla z25.s, z4.s, z0.s, #0 +; CHECK-NEXT: cmla z26.s, z5.s, z1.s, #0 +; CHECK-NEXT: cmla z27.s, z6.s, z2.s, #0 +; CHECK-NEXT: cmla z24.s, z7.s, z3.s, #0 +; CHECK-NEXT: cmla z25.s, z4.s, z0.s, #90 +; CHECK-NEXT: cmla z26.s, z5.s, z1.s, #90 +; CHECK-NEXT: cmla z27.s, z6.s, z2.s, #90 +; CHECK-NEXT: cmla z24.s, z7.s, z3.s, #90 +; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: mov z1.d, z26.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: mov z3.d, z24.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i32( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i32( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16i32( %5, %2) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4i32() +declare @llvm.experimental.vector.interleave2.nxv4i32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i32() +declare @llvm.experimental.vector.interleave2.nxv8i32(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16i32() +declare @llvm.experimental.vector.interleave2.nxv16i32(, ) + diff --git 
a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_add_v2i64( %a, %b) { +; CHECK-LABEL: complex_add_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z1.d, z1.d, z0.d, #90 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2i64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2i64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv2i64( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v4i64( %a, %b) { +; CHECK-LABEL: complex_add_v4i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z2.d, z2.d, z0.d, #90 +; CHECK-NEXT: cadd z3.d, z3.d, z1.d, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call 
@llvm.experimental.vector.interleave2.nxv4i64( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v8i64( %a, %b) { +; CHECK-LABEL: complex_add_v8i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z6.d, z6.d, z2.d, #90 +; CHECK-NEXT: cadd z4.d, z4.d, z0.d, #90 +; CHECK-NEXT: cadd z5.d, z5.d, z1.d, #90 +; CHECK-NEXT: cadd z7.d, z7.d, z3.d, #90 +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z2.d, z6.d +; CHECK-NEXT: mov z3.d, z7.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i64( %0, %1) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv2i64() +declare @llvm.experimental.vector.interleave2.nxv2i64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4i64() +declare @llvm.experimental.vector.interleave2.nxv4i64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i64() +declare @llvm.experimental.vector.interleave2.nxv8i64(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define @complex_mul_v2i64( %a, %b) { +; CHECK-LABEL: 
complex_mul_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: cmla z2.d, z1.d, z0.d, #0 +; CHECK-NEXT: cmla z2.d, z1.d, z0.d, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2i64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv2i64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv2i64( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_mul_v4i64( %a, %b) { +; CHECK-LABEL: complex_mul_v4i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: cmla z4.d, z3.d, z1.d, #0 +; CHECK-NEXT: cmla z5.d, z2.d, z0.d, #0 +; CHECK-NEXT: cmla z4.d, z3.d, z1.d, #90 +; CHECK-NEXT: cmla z5.d, z2.d, z0.d, #90 +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4i64( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define 
@complex_mul_v8i64( %a, %b) { +; CHECK-LABEL: complex_mul_v8i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #0 +; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #0 +; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #0 +; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #0 +; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #90 +; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #90 +; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #90 +; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #90 +; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: mov z1.d, z26.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: mov z3.d, z24.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = mul %b.imag, %a.real + %1 = mul %b.real, %a.imag + %2 = add %1, %0 + %3 = mul %b.real, %a.real + %4 = mul %a.imag, %b.imag + %5 = sub %3, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i64( %5, %2) + ret %interleaved.vec +} + +; Expected to transform +define @complex_minus_mul_v8i64( %a, %b) { +; CHECK-LABEL: complex_minus_mul_v8i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #270 +; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #270 +; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #270 +; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #270 +; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #180 +; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #180 +; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #180 +; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #180 +; 
CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: mov z1.d, z26.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: mov z3.d, z24.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i64( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %0 = sub zeroinitializer, %a.real + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i64( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %1 = mul %b.real, %0 + %2 = mul %b.imag, %a.imag + %3 = add %2, %1 + %4 = mul %b.real, %a.imag + %5 = mul %b.imag, %0 + %6 = sub %5, %4 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i64( %3, %6) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv2i64() +declare @llvm.experimental.vector.interleave2.nxv2i64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4i64() +declare @llvm.experimental.vector.interleave2.nxv4i64(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i64() +declare @llvm.experimental.vector.interleave2.nxv8i64(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to not transform as the type's minimum size is less than 128 bits. 
+define @complex_add_v8i8( %a, %b) { +; CHECK-LABEL: complex_add_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z3.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z4.s, z0.s, z2.s +; CHECK-NEXT: uzp2 z0.s, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z1.s, z3.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z3.s +; CHECK-NEXT: sub z0.s, z1.s, z0.s +; CHECK-NEXT: add z1.s, z2.s, z4.s +; CHECK-NEXT: zip2 z2.s, z0.s, z1.s +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i8( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i8( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv8i8( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v16i8( %a, %b) { +; CHECK-LABEL: complex_add_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z1.b, z1.b, z0.b, #90 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i8( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i8( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv16i8( %0, %1) + ret %interleaved.vec +} + +; Expected to transform +define @complex_add_v32i8( %a, %b) { +; 
CHECK-LABEL: complex_add_v32i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cadd z2.b, z2.b, z0.b, #90 +; CHECK-NEXT: cadd z3.b, z3.b, z1.b, #90 +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: ret +entry: + %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( %a) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( %b) + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %0 = sub %b.real, %a.imag + %1 = add %b.imag, %a.real + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv32i8( %0, %1) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i8() +declare @llvm.experimental.vector.interleave2.nxv8i8(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv16i8() +declare @llvm.experimental.vector.interleave2.nxv16i8(, ) + +declare { , } @llvm.experimental.vector.deinterleave2.nxv32i8() +declare @llvm.experimental.vector.interleave2.nxv32i8(, )