diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -788,6 +788,16 @@

   bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;

+  bool isComplexDeinterleavingSupported() const override;
+  bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const override;
+
+  Value *
+  createComplexDeinterleavingIR(Instruction *I,
+                                ComplexDeinterleavingOperation OperationType,
+                                unsigned Rotation, Value *InputA, Value *InputB,
+                                Value *Accumulator = nullptr) const override;
+
   bool hasBitPreservingFPLogic(EVT VT) const override {
     // FIXME: Is this always true? It should be true for vectors at least.
     return VT == MVT::f32 || VT == MVT::f64;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21409,3 +21409,80 @@
     unsigned Opc, LLT Ty1, LLT Ty2) const {
   return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
 }
+
+/// Reports whether the complex deinterleaving transform may run for this
+/// subtarget: it needs both the ComplxNum and NEON features.
+bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
+  return Subtarget->hasComplxNum() && Subtarget->hasNEON();
+}
+
+/// Returns true when \p Ty is a fixed-width vector shape accepted here:
+/// 4 or 8 x half (only with full FP16), 2 or 4 x float, or 2 x double.
+/// \p Operation is not consulted; every operation shares the same type rules.
+bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
+    ComplexDeinterleavingOperation Operation, Type *Ty) const {
+  auto *VTy = dyn_cast<FixedVectorType>(Ty);
+  if (!VTy)
+    return false;
+
+  auto *ScalarTy = VTy->getScalarType();
+  unsigned NumElements = VTy->getNumElements();
+
+  if (ScalarTy->isHalfTy() && Subtarget->hasFullFP16())
+    return NumElements == 4 || NumElements == 8;
+  if (ScalarTy->isFloatTy())
+    return NumElements == 2 || NumElements == 4;
+  if (ScalarTy->isDoubleTy())
+    return NumElements == 2;
+  return false;
+}
+
+/// Emits, immediately before \p I, the NEON intrinsic call that implements
+/// \p OperationType for the given \p Rotation (in degrees).
+///
+/// CMulPartial selects aarch64_neon_vcmla_rot{0,90,180,270} and passes the
+/// operands as (Accumulator, InputB, InputA); a null \p Accumulator is
+/// replaced by a zero constant of InputA's type. CAdd selects
+/// aarch64_neon_vcadd_rot{90,270} with (InputA, InputB). Any other
+/// operation/rotation combination yields nullptr.
+Value *AArch64TargetLowering::createComplexDeinterleavingIR(
+    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    unsigned Rotation, Value *InputA, Value *InputB, Value *Accumulator) const {
+  auto *Ty = InputA->getType();
+
+  IRBuilder<> B(I);
+
+  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+    const Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
+                                    Intrinsic::aarch64_neon_vcmla_rot90,
+                                    Intrinsic::aarch64_neon_vcmla_rot180,
+                                    Intrinsic::aarch64_neon_vcmla_rot270};
+
+    // Rotation / 90 indexes IdMap, so only 0, 90, 180 and 270 are valid.
+    // Bail out on anything else instead of reading past the table.
+    if (Rotation >= 360 || Rotation % 90 != 0)
+      return nullptr;
+
+    // No accumulator supplied: accumulate onto zero.
+    if (Accumulator == nullptr)
+      Accumulator = ConstantFP::get(Ty, 0);
+
+    return B.CreateIntrinsic(IdMap[Rotation / 90], Ty,
+                             {Accumulator, InputB, InputA});
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
+    if (Rotation == 90)
+      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
+    else if (Rotation == 270)
+      IntId = Intrinsic::aarch64_neon_vcadd_rot270;
+
+    if (IntId == Intrinsic::not_intrinsic)
+      return nullptr;
+
+    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
+  }
+
+  return nullptr;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -568,6 +568,10 @@
   addPass(createAArch64StackTaggingPass(
/*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedLoadCombinePass()); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/IntrinsicInst.h" diff --git a/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f16-add.ll b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f16-add.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + + + +define <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h2, v0.h[1] +; CHECK-NEXT: mov h3, v1.h[1] +; CHECK-NEXT: fsub h1, h1, h2 +; CHECK-NEXT: fadd h0, h3, h0 +; CHECK-NEXT: mov v1.h[1], v0.h[0] +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> 
+ %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %b.real, %a.imag + %1 = fadd fast <1 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + ret <2 x half> %interleaved.vec +} +define <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %b.real, %a.imag + %1 = fadd fast <2 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + ret <4 x half> %interleaved.vec +} +define <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %b.real, %a.imag + %1 = fadd fast <4 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + ret <8 x half> %interleaved.vec +} +define <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp1 v4.8h, v2.8h, v3.8h +; 
CHECK-NEXT: uzp1 v5.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: fsub v2.8h, v4.8h, v0.8h +; CHECK-NEXT: fadd v1.8h, v1.8h, v5.8h +; CHECK-NEXT: zip1 v0.8h, v2.8h, v1.8h +; CHECK-NEXT: zip2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %b.real, %a.imag + %1 = fadd fast <8 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + ret <16 x half> %interleaved.vec +} +define <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp1 v16.8h, v4.8h, v5.8h +; CHECK-NEXT: uzp1 v17.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp1 v18.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp2 v2.8h, v4.8h, v5.8h +; CHECK-NEXT: uzp1 v3.8h, v6.8h, v7.8h +; CHECK-NEXT: uzp2 v4.8h, v6.8h, v7.8h +; CHECK-NEXT: fsub v5.8h, v16.8h, v0.8h +; CHECK-NEXT: fadd v2.8h, v2.8h, v18.8h +; CHECK-NEXT: fsub v3.8h, v3.8h, v1.8h +; CHECK-NEXT: fadd v4.8h, v4.8h, v17.8h +; CHECK-NEXT: zip1 v0.8h, v5.8h, v2.8h +; CHECK-NEXT: zip2 v1.8h, v5.8h, v2.8h +; CHECK-NEXT: zip1 v2.8h, v3.8h, v4.8h +; CHECK-NEXT: zip2 v3.8h, v3.8h, v4.8h +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %0 = fsub fast <16 x 
half> %b.real, %a.imag + %1 = fadd fast <16 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f16-mul.ll b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f16-mul.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: fmul h4, h2, v0.h[0] +; CHECK-NEXT: fnmul h2, h3, h2 +; CHECK-NEXT: fmla h4, h3, v1.h[0] +; CHECK-NEXT: fmla h2, h0, v1.h[0] +; CHECK-NEXT: mov v2.h[1], v4.h[0] +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %b.imag, %a.real + %1 = fmul fast <1 x half> %b.real, %a.imag + %2 = fadd fast <1 x half> %1, %0 + %3 = fmul fast <1 x half> %b.real, %a.real + %4 = fmul fast <1 x half> %a.imag, %b.imag + %5 = fsub fast <1 x half> %3, %4 + %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> + ret <2 x half> %interleaved.vec +} + +define <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #90 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #0 +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %b.imag, %a.real + %1 = fmul fast <2 x half> %b.real, %a.imag + %2 = fadd fast <2 x half> %1, %0 + %3 = fmul fast <2 x half> %b.real, %a.real + %4 = fmul fast <2 x half> %a.imag, %b.imag + %5 = fsub fast <2 x half> %3, %4 + %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> + ret <4 x half> %interleaved.vec +} + +define <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #90 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #0 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %b.imag, %a.real + %1 = fmul fast <4 x half> %b.real, %a.imag + %2 = fadd fast <4 x half> %1, %0 + %3 = fmul fast <4 x half> %b.real, %a.real + %4 = fmul fast <4 x half> %a.imag, %b.imag + %5 = fsub fast <4 x half> %3, %4 + %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> + ret <8 x half> %interleaved.vec +} + +define <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 v4.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp2 v5.8h, 
v0.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; CHECK-NEXT: fmul v1.8h, v5.8h, v4.8h +; CHECK-NEXT: fmul v3.8h, v4.8h, v0.8h +; CHECK-NEXT: fneg v1.8h, v1.8h +; CHECK-NEXT: fmla v3.8h, v5.8h, v2.8h +; CHECK-NEXT: fmla v1.8h, v0.8h, v2.8h +; CHECK-NEXT: zip1 v0.8h, v1.8h, v3.8h +; CHECK-NEXT: zip2 v1.8h, v1.8h, v3.8h +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %0 = fmul fast <8 x half> %b.imag, %a.real + %1 = fmul fast <8 x half> %b.real, %a.imag + %2 = fadd fast <8 x half> %1, %0 + %3 = fmul fast <8 x half> %b.real, %a.real + %4 = fmul fast <8 x half> %a.imag, %b.imag + %5 = fsub fast <8 x half> %3, %4 + %interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> + ret <16 x half> %interleaved.vec +} + +define <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 v16.8h, v4.8h, v5.8h +; CHECK-NEXT: uzp1 v17.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp1 v18.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp2 v2.8h, v6.8h, v7.8h +; CHECK-NEXT: uzp1 v3.8h, v4.8h, v5.8h +; CHECK-NEXT: fmul v4.8h, v0.8h, v16.8h +; CHECK-NEXT: uzp1 v5.8h, v6.8h, v7.8h +; CHECK-NEXT: fmul v6.8h, v1.8h, v2.8h +; CHECK-NEXT: fmul v7.8h, v16.8h, v18.8h +; CHECK-NEXT: fneg v4.8h, v4.8h +; CHECK-NEXT: fmul v16.8h, v2.8h, v17.8h +; CHECK-NEXT: fneg v6.8h, v6.8h +; CHECK-NEXT: fmla v7.8h, v0.8h, v3.8h +; CHECK-NEXT: fmla v4.8h, v18.8h, v3.8h +; CHECK-NEXT: fmla v16.8h, v1.8h, v5.8h +; CHECK-NEXT: fmla v6.8h, v17.8h, v5.8h +; CHECK-NEXT: zip1 v0.8h, v4.8h, v7.8h +; CHECK-NEXT: zip2 v1.8h, v4.8h, v7.8h +; CHECK-NEXT: 
zip1 v2.8h, v6.8h, v16.8h +; CHECK-NEXT: zip2 v3.8h, v6.8h, v16.8h +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %0 = fmul fast <16 x half> %b.imag, %a.real + %1 = fmul fast <16 x half> %b.real, %a.imag + %2 = fadd fast <16 x half> %1, %0 + %3 = fmul fast <16 x half> %b.real, %a.real + %4 = fmul fast <16 x half> %a.imag, %b.imag + %5 = fsub fast <16 x half> %3, %4 + %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f32-add.ll b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f32-add.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + + + +define <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.2s, v1.2s, v0.2s, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %b.real, %a.imag + %1 = fadd fast <1 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + ret <2 x float> %interleaved.vec +} 
+define <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %b.real, %a.imag + %1 = fadd fast <2 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} +define <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp1 v4.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp1 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: fsub v2.4s, v4.4s, v0.4s +; CHECK-NEXT: fadd v1.4s, v1.4s, v5.4s +; CHECK-NEXT: zip1 v0.4s, v2.4s, v1.4s +; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %b.real, %a.imag + %1 = fadd fast <4 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + ret <8 x float> %interleaved.vec +} +define <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp1 v16.4s, v4.4s, v5.4s +; CHECK-NEXT: uzp1 v17.4s, v2.4s, v3.4s +; CHECK-NEXT: 
uzp1 v18.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v4.4s, v5.4s +; CHECK-NEXT: uzp1 v3.4s, v6.4s, v7.4s +; CHECK-NEXT: uzp2 v4.4s, v6.4s, v7.4s +; CHECK-NEXT: fsub v5.4s, v16.4s, v0.4s +; CHECK-NEXT: fadd v2.4s, v2.4s, v18.4s +; CHECK-NEXT: fsub v3.4s, v3.4s, v1.4s +; CHECK-NEXT: fadd v4.4s, v4.4s, v17.4s +; CHECK-NEXT: zip1 v0.4s, v5.4s, v2.4s +; CHECK-NEXT: zip2 v1.4s, v5.4s, v2.4s +; CHECK-NEXT: zip1 v2.4s, v3.4s, v4.4s +; CHECK-NEXT: zip2 v3.4s, v3.4s, v4.4s +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %b.real, %a.imag + %1 = fadd fast <8 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f32-mul.ll b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f32-mul.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +define <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #90 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #0 +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 
x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %0 = fmul fast <1 x float> %b.imag, %a.real + %1 = fmul fast <1 x float> %b.real, %a.imag + %2 = fadd fast <1 x float> %1, %0 + %3 = fmul fast <1 x float> %b.real, %a.real + %4 = fmul fast <1 x float> %a.imag, %b.imag + %5 = fsub fast <1 x float> %3, %4 + %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> + ret <2 x float> %interleaved.vec +} + +define <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %b.imag, %a.real + %1 = fmul fast <2 x float> %b.real, %a.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %b.real, %a.real + %4 = fmul fast <2 x float> %a.imag, %b.imag + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +define <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_mul_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 v4.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v1.4s, v5.4s, v4.4s +; CHECK-NEXT: fmul v3.4s, v4.4s, v0.4s +; CHECK-NEXT: fneg v1.4s, 
v1.4s +; CHECK-NEXT: fmla v3.4s, v5.4s, v2.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: zip1 v0.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %0 = fmul fast <4 x float> %b.imag, %a.real + %1 = fmul fast <4 x float> %b.real, %a.imag + %2 = fadd fast <4 x float> %1, %0 + %3 = fmul fast <4 x float> %b.real, %a.real + %4 = fmul fast <4 x float> %a.imag, %b.imag + %5 = fsub fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + ret <8 x float> %interleaved.vec +} + +define <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 v16.4s, v4.4s, v5.4s +; CHECK-NEXT: uzp1 v17.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp1 v18.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v6.4s, v7.4s +; CHECK-NEXT: uzp1 v3.4s, v4.4s, v5.4s +; CHECK-NEXT: fmul v4.4s, v0.4s, v16.4s +; CHECK-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; CHECK-NEXT: fmul v6.4s, v1.4s, v2.4s +; CHECK-NEXT: fmul v7.4s, v16.4s, v18.4s +; CHECK-NEXT: fneg v4.4s, v4.4s +; CHECK-NEXT: fmul v16.4s, v2.4s, v17.4s +; CHECK-NEXT: fneg v6.4s, v6.4s +; CHECK-NEXT: fmla v7.4s, v0.4s, v3.4s +; CHECK-NEXT: fmla v4.4s, v18.4s, v3.4s +; CHECK-NEXT: fmla v16.4s, v1.4s, v5.4s +; CHECK-NEXT: fmla v6.4s, v17.4s, v5.4s +; CHECK-NEXT: zip1 v0.4s, v4.4s, v7.4s +; CHECK-NEXT: zip2 v1.4s, v4.4s, v7.4s +; CHECK-NEXT: zip1 v2.4s, v6.4s, v16.4s +; CHECK-NEXT: zip2 v3.4s, v6.4s, v16.4s +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %a.imag = 
shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %0 = fmul fast <8 x float> %b.imag, %a.real + %1 = fmul fast <8 x float> %b.real, %a.imag + %2 = fadd fast <8 x float> %1, %0 + %3 = fmul fast <8 x float> %b.real, %a.real + %4 = fmul fast <8 x float> %a.imag, %b.imag + %5 = fsub fast <8 x float> %3, %4 + %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f64-add.ll b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f64-add.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + + + +define <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.2d, v1.2d, v0.2d, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %b.real, %a.imag + %1 = fadd fast <1 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + ret <2 x double> %interleaved.vec +} +define <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip1 
v4.2d, v2.2d, v3.2d +; CHECK-NEXT: zip1 v5.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d +; CHECK-NEXT: fsub v2.2d, v4.2d, v0.2d +; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d +; CHECK-NEXT: zip1 v0.2d, v2.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v1.2d +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x double> %b.real, %a.imag + %1 = fadd fast <2 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + ret <4 x double> %interleaved.vec +} +define <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d +; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d +; CHECK-NEXT: zip2 v4.2d, v6.2d, v7.2d +; CHECK-NEXT: fsub v5.2d, v16.2d, v0.2d +; CHECK-NEXT: fadd v2.2d, v2.2d, v18.2d +; CHECK-NEXT: fsub v3.2d, v3.2d, v1.2d +; CHECK-NEXT: fadd v4.2d, v4.2d, v17.2d +; CHECK-NEXT: zip1 v0.2d, v5.2d, v2.2d +; CHECK-NEXT: zip2 v1.2d, v5.2d, v2.2d +; CHECK-NEXT: zip1 v2.2d, v3.2d, v4.2d +; CHECK-NEXT: zip2 v3.2d, v3.2d, v4.2d +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> 
zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %b.real, %a.imag + %1 = fadd fast <4 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f64-mul.ll b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ComplexArithmetic/complex-arithmetic-f64-mul.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +define <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #90 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #0 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %b.imag, %a.real + %1 = fmul fast <1 x double> %b.real, %a.imag + %2 = fadd fast <1 x double> %1, %0 + %3 = fmul fast <1 x double> %b.real, %a.real + %4 = fmul fast <1 x double> %a.imag, %b.imag + %5 = fsub fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> + ret <2 x double> %interleaved.vec +} + +define <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip2 v4.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v5.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v0.2d, 
v0.2d, v1.2d +; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v1.2d, v5.2d, v4.2d +; CHECK-NEXT: fmul v3.2d, v4.2d, v0.2d +; CHECK-NEXT: fneg v1.2d, v1.2d +; CHECK-NEXT: fmla v3.2d, v5.2d, v2.2d +; CHECK-NEXT: fmla v1.2d, v0.2d, v2.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v3.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v3.2d +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %b.imag, %a.real + %1 = fmul fast <2 x double> %b.real, %a.imag + %2 = fadd fast <2 x double> %1, %0 + %3 = fmul fast <2 x double> %b.real, %a.real + %4 = fmul fast <2 x double> %a.imag, %b.imag + %5 = fsub fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> + ret <4 x double> %interleaved.vec +} + +define <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip2 v16.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d +; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v2.2d, v6.2d, v7.2d +; CHECK-NEXT: zip1 v3.2d, v4.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v0.2d, v16.2d +; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d +; CHECK-NEXT: fmul v6.2d, v1.2d, v2.2d +; CHECK-NEXT: fmul v7.2d, v16.2d, v18.2d +; CHECK-NEXT: fneg v4.2d, v4.2d +; CHECK-NEXT: fmul v16.2d, v2.2d, v17.2d +; CHECK-NEXT: fneg v6.2d, v6.2d +; CHECK-NEXT: fmla v7.2d, v0.2d, v3.2d +; CHECK-NEXT: fmla v4.2d, v18.2d, v3.2d +; CHECK-NEXT: fmla v16.2d, v1.2d, v5.2d +; CHECK-NEXT: fmla v6.2d, v17.2d, v5.2d +; CHECK-NEXT: zip1 v0.2d, v4.2d, v7.2d +; CHECK-NEXT: zip2 v1.2d, v4.2d, v7.2d +; CHECK-NEXT: zip1 v2.2d, 
v6.2d, v16.2d +; CHECK-NEXT: zip2 v3.2d, v6.2d, v16.2d +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %b.imag, %a.real + %1 = fmul fast <4 x double> %b.real, %a.imag + %2 = fadd fast <4 x double> %1, %0 + %3 = fmul fast <4 x double> %b.real, %a.real + %4 = fmul fast <4 x double> %a.imag, %b.imag + %5 = fsub fast <4 x double> %3, %4 + %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -74,6 +74,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: AArch64 Stack Tagging +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Interleaved Load Combine Pass