diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -812,6 +812,15 @@ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool isComplexDeinterleavingSupported() const override; + bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const override; + + Value *createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const override; + bool hasBitPreservingFPLogic(EVT VT) const override { // FIXME: Is this always true? It should be true for vectors at least. return VT == MVT::f32 || VT == MVT::f64; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23289,3 +23289,94 @@ unsigned Opc, LLT Ty1, LLT Ty2) const { return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); } + +bool AArch64TargetLowering::isComplexDeinterleavingSupported() const { + return Subtarget->hasComplxNum(); +} + +bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + auto *VTy = dyn_cast<FixedVectorType>(Ty); + if (!VTy) + return false; + + auto *ScalarTy = VTy->getScalarType(); + unsigned NumElements = VTy->getNumElements(); + + unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; + if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth)) + return false; + + return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || + ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); +} + +Value *AArch64TargetLowering::createComplexDeinterleavingIR( + Instruction *I, 
ComplexDeinterleavingOperation OperationType, + ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, + Value *Accumulator) const { + FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType()); + + IRBuilder<> B(I); + + unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); + + assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) && + "Vector type must be either 64 or a power of 2 that is at least 128"); + + if (TyWidth > 128) { + int Stride = Ty->getNumElements() / 2; + auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements()); + auto SplitSeqVec = llvm::to_vector(SplitSeq); + ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride); + ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride); + + auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask); + auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask); + auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask); + auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask); + Value *LowerSplitAcc = nullptr; + Value *UpperSplitAcc = nullptr; + + if (Accumulator) { + LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask); + UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask); + } + + auto *LowerSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + auto *UpperSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + + ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements()); + return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); + } + + if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0, + Intrinsic::aarch64_neon_vcmla_rot90, + Intrinsic::aarch64_neon_vcmla_rot180, + Intrinsic::aarch64_neon_vcmla_rot270}; + + if (Accumulator == nullptr) + Accumulator = ConstantFP::get(Ty, 0); + + return 
B.CreateIntrinsic(IdMap[(int)Rotation], Ty, + {Accumulator, InputB, InputA}); + } + + if (OperationType == ComplexDeinterleavingOperation::CAdd) { + Intrinsic::ID IntId = Intrinsic::not_intrinsic; + if (Rotation == ComplexDeinterleavingRotation::Rotation_90) + IntId = Intrinsic::aarch64_neon_vcadd_rot90; + else if (Rotation == ComplexDeinterleavingRotation::Rotation_270) + IntId = Intrinsic::aarch64_neon_vcadd_rot270; + + if (IntId == Intrinsic::not_intrinsic) + return nullptr; + + return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -589,6 +589,10 @@ addPass(createAArch64StackTaggingPass( /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. 
if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedLoadCombinePass()); diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -87,6 +87,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: AArch64 Stack Tagging +; CHECK-NEXT: Complex Deinterleaving Pass ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Interleaved Load Combine Pass diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to not transform +define <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h2, v0.h[1] +; CHECK-NEXT: mov h3, v1.h[1] +; CHECK-NEXT: fsub h1, h1, h2 +; CHECK-NEXT: fadd h0, h3, h0 +; CHECK-NEXT: mov v1.h[1], v0.h[0] +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %b.real, %a.imag + %1 = fadd fast <1 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x 
i32> + ret <2 x half> %interleaved.vec +} + +; Expected to transform +define <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %b.real, %a.imag + %1 = fadd fast <2 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + ret <4 x half> %interleaved.vec +} + +; Expected to transform +define <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %b.real, %a.imag + %1 = fadd fast <4 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + ret <8 x half> %interleaved.vec +} + +; Expected to transform +define <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.8h, v2.8h, v0.8h, #90 +; CHECK-NEXT: fcadd v1.8h, v3.8h, v1.8h, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + 
%b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %b.real, %a.imag + %1 = fadd fast <8 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + ret <16 x half> %interleaved.vec +} + +; Expected to transform +define <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v2.8h, v6.8h, v2.8h, #90 +; CHECK-NEXT: fcadd v0.8h, v4.8h, v0.8h, #90 +; CHECK-NEXT: fcadd v1.8h, v5.8h, v1.8h, #90 +; CHECK-NEXT: fcadd v3.8h, v7.8h, v3.8h, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %0 = fsub fast <16 x half> %b.real, %a.imag + %1 = fadd fast <16 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to not transform +define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; 
CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: fmul h4, h2, v0.h[0] +; CHECK-NEXT: fnmul h2, h3, h2 +; CHECK-NEXT: fmla h4, h3, v1.h[0] +; CHECK-NEXT: fmla h2, h0, v1.h[0] +; CHECK-NEXT: mov v2.h[1], v4.h[0] +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %b.imag, %a.real + %1 = fmul fast <1 x half> %b.real, %a.imag + %2 = fadd fast <1 x half> %1, %0 + %3 = fmul fast <1 x half> %b.real, %a.real + %4 = fmul fast <1 x half> %a.imag, %b.imag + %5 = fsub fast <1 x half> %3, %4 + %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> + ret <2 x half> %interleaved.vec +} + +; Expected to transform +define <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #0 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #90 +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %b.imag, %a.real + %1 = fmul fast <2 x half> %b.real, %a.imag + %2 = fadd fast <2 x half> %1, %0 + %3 = fmul fast <2 x half> %b.real, %a.real + %4 = fmul fast <2 x half> %a.imag, %b.imag + %5 = fsub fast <2 x half> %3, %4 + %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> + ret <4 x half> %interleaved.vec +} + +; Expected to transform +define <8 x half> @complex_mul_v8f16(<8 x half> 
%a, <8 x half> %b) { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #0 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #90 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %b.imag, %a.real + %1 = fmul fast <4 x half> %b.real, %a.imag + %2 = fadd fast <4 x half> %1, %0 + %3 = fmul fast <4 x half> %b.real, %a.real + %4 = fmul fast <4 x half> %a.imag, %b.imag + %5 = fsub fast <4 x half> %3, %4 + %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> + ret <8 x half> %interleaved.vec +} + +; Expected to transform +define <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.8h, v0.8h, v2.8h, #0 +; CHECK-NEXT: fcmla v5.8h, v1.8h, v3.8h, #0 +; CHECK-NEXT: fcmla v4.8h, v0.8h, v2.8h, #90 +; CHECK-NEXT: fcmla v5.8h, v1.8h, v3.8h, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %0 = fmul fast <8 x half> %b.imag, %a.real + %1 = fmul fast <8 x half> %b.real, %a.imag + %2 = fadd fast <8 x half> %1, %0 + %3 = fmul fast <8 x half> %b.real, %a.real + %4 = fmul fast <8 x half> %a.imag, %b.imag + %5 = fsub fast <8 x half> %3, %4 + 
%interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> + ret <16 x half> %interleaved.vec +} + +; Expected to transform +define <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.8h, v0.8h, v4.8h, #0 +; CHECK-NEXT: fcmla v17.8h, v1.8h, v5.8h, #0 +; CHECK-NEXT: fcmla v18.8h, v2.8h, v6.8h, #0 +; CHECK-NEXT: fcmla v19.8h, v3.8h, v7.8h, #0 +; CHECK-NEXT: fcmla v16.8h, v0.8h, v4.8h, #90 +; CHECK-NEXT: fcmla v17.8h, v1.8h, v5.8h, #90 +; CHECK-NEXT: fcmla v18.8h, v2.8h, v6.8h, #90 +; CHECK-NEXT: fcmla v19.8h, v3.8h, v7.8h, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %0 = fmul fast <16 x half> %b.imag, %a.real + %1 = fmul fast <16 x half> %b.real, %a.imag + %2 = fadd fast <16 x half> %1, %0 + %3 = fmul fast <16 x half> %b.real, %a.real + %4 = fmul fast <16 x half> %a.imag, %b.imag + %5 = fsub fast <16 x half> %3, %4 + %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + + +; Expected to transform +define <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.2s, v1.2s, v0.2s, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %b.real, %a.imag + %1 = fadd fast <1 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + ret <2 x float> %interleaved.vec +} + +; Expected to transform +define <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %b.real, %a.imag + %1 = fadd fast <2 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4s, v2.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v1.4s, v3.4s, v1.4s, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> 
zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %b.real, %a.imag + %1 = fadd fast <4 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + ret <8 x float> %interleaved.vec +} + +; Expected to transform +define <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v2.4s, v6.4s, v2.4s, #90 +; CHECK-NEXT: fcadd v0.4s, v4.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v1.4s, v5.4s, v1.4s, #90 +; CHECK-NEXT: fcadd v3.4s, v7.4s, v3.4s, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %b.real, %a.imag + %1 = fadd fast <8 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll @@ -0,0 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #0 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #90 +; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %0 = fmul fast <1 x float> %b.imag, %a.real + %1 = fmul fast <1 x float> %b.real, %a.imag + %2 = fadd fast <1 x float> %1, %0 + %3 = fmul fast <1 x float> %b.real, %a.real + %4 = fmul fast <1 x float> %a.imag, %b.imag + %5 = fsub fast <1 x float> %3, %4 + %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> + ret <2 x float> %interleaved.vec +} + +; Expected to transform +define <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %b.imag, %a.real + %1 = fmul fast <2 x float> %b.real, %a.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %b.real, %a.real + %4 = fmul fast <2 x float> %a.imag, %b.imag + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: 
complex_mul_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.4s, v0.4s, v2.4s, #0 +; CHECK-NEXT: fcmla v5.4s, v1.4s, v3.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v0.4s, v2.4s, #90 +; CHECK-NEXT: fcmla v5.4s, v1.4s, v3.4s, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %0 = fmul fast <4 x float> %b.imag, %a.real + %1 = fmul fast <4 x float> %b.real, %a.imag + %2 = fadd fast <4 x float> %1, %0 + %3 = fmul fast <4 x float> %b.real, %a.real + %4 = fmul fast <4 x float> %a.imag, %b.imag + %5 = fsub fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + ret <8 x float> %interleaved.vec +} + +; Expected to transform +define <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.4s, v0.4s, v4.4s, #0 +; CHECK-NEXT: fcmla v17.4s, v1.4s, v5.4s, #0 +; CHECK-NEXT: fcmla v18.4s, v2.4s, v6.4s, #0 +; CHECK-NEXT: fcmla v19.4s, v3.4s, v7.4s, #0 +; CHECK-NEXT: fcmla v16.4s, v0.4s, v4.4s, #90 +; CHECK-NEXT: fcmla v17.4s, v1.4s, v5.4s, #90 +; CHECK-NEXT: fcmla v18.4s, v2.4s, v6.4s, #90 +; CHECK-NEXT: fcmla v19.4s, v3.4s, v7.4s, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x 
float> %a, <16 x float> poison, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %0 = fmul fast <8 x float> %b.imag, %a.real + %1 = fmul fast <8 x float> %b.real, %a.imag + %2 = fadd fast <8 x float> %1, %0 + %3 = fmul fast <8 x float> %b.real, %a.real + %4 = fmul fast <8 x float> %a.imag, %b.imag + %5 = fsub fast <8 x float> %3, %4 + %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + + +; Expected to transform +define <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.2d, v1.2d, v0.2d, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %b.real, %a.imag + %1 = fadd fast <1 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + ret <2 x double> %interleaved.vec +} + +; Expected to transform +define <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: 
complex_add_v4f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcadd v1.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x double> %b.real, %a.imag + %1 = fadd fast <2 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; Expected to transform +define <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v2.2d, v6.2d, v2.2d, #90 +; CHECK-NEXT: fcadd v0.2d, v4.2d, v0.2d, #90 +; CHECK-NEXT: fcadd v1.2d, v5.2d, v1.2d, #90 +; CHECK-NEXT: fcadd v3.2d, v7.2d, v3.2d, #90 +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %b.real, %a.imag + %1 = fadd fast <4 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s 
--mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %b.imag, %a.real + %1 = fmul fast <1 x double> %b.real, %a.imag + %2 = fadd fast <1 x double> %1, %0 + %3 = fmul fast <1 x double> %b.real, %a.real + %4 = fmul fast <1 x double> %a.imag, %b.imag + %5 = fsub fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> + ret <2 x double> %interleaved.vec +} + +; Expected to transform +define <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.2d, v0.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v5.2d, v1.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v4.2d, v0.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v5.2d, v1.2d, v3.2d, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> 
%b.imag, %a.real + %1 = fmul fast <2 x double> %b.real, %a.imag + %2 = fadd fast <2 x double> %1, %0 + %3 = fmul fast <2 x double> %b.real, %a.real + %4 = fmul fast <2 x double> %a.imag, %b.imag + %5 = fsub fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; Expected to transform +define <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v4.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v18.2d, v2.2d, v6.2d, #0 +; CHECK-NEXT: fcmla v19.2d, v3.2d, v7.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v4.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v5.2d, #90 +; CHECK-NEXT: fcmla v18.2d, v2.2d, v6.2d, #90 +; CHECK-NEXT: fcmla v19.2d, v3.2d, v7.2d, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %b.imag, %a.real + %1 = fmul fast <4 x double> %b.real, %a.imag + %2 = fadd fast <4 x double> %1, %0 + %3 = fmul fast <4 x double> %b.real, %a.real + %4 = fmul fast <4 x double> %a.imag, %b.imag + %5 = fsub fast <4 x double> %3, %4 + %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll 
b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll @@ -0,0 +1,363 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_mul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: fcmla v3.4s, v4.4s, v2.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v4.4s, v2.4s, #90 +; CHECK-NEXT: mov v0.16b, v3.16b +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec151 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec153 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec154 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec154, %strided.vec151 + %1 = fmul fast <2 x float> %strided.vec153, %strided.vec + %2 = fmul fast <2 x float> %strided.vec154, %strided.vec + %3 = fmul fast <2 x float> %strided.vec153, %strided.vec151 + %4 = fadd fast <2 x float> %3, %2 + %5 = fsub fast <2 x float> %1, %0 + %strided.vec156 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec157 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %6 = fmul fast <2 x float> %4, %strided.vec156 + %7 = fmul fast <2 x float> %5, %strided.vec157 + %8 = fadd fast <2 x float> %6, %7 + %9 = fmul fast <2 x float> %strided.vec156, %5 + %10 = fmul fast <2 x float> %4, %strided.vec157 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x 
float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: add_mul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsub v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fsub v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip2 v0.2s, v0.2s, v4.2s +; CHECK-NEXT: zip2 v4.2s, v2.2s, v3.2s +; CHECK-NEXT: zip1 v1.2s, v1.2s, v5.2s +; CHECK-NEXT: zip1 v2.2s, v2.2s, v3.2s +; CHECK-NEXT: fmul v5.2s, v4.2s, v0.2s +; CHECK-NEXT: fmul v3.2s, v1.2s, v4.2s +; CHECK-NEXT: fneg v4.2s, v5.2s +; CHECK-NEXT: fmla v3.2s, v0.2s, v2.2s +; CHECK-NEXT: fmla v4.2s, v1.2s, v2.2s +; CHECK-NEXT: zip2 v1.2s, v4.2s, v3.2s +; CHECK-NEXT: zip1 v0.2s, v4.2s, v3.2s +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %0 = fsub fast <4 x float> %b, %c + %1 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> + %strided.vec58 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %2 = fmul fast <2 x float> %1, %strided.vec59 + %3 = fsub fast <4 x float> %b, %a + %4 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> + %5 = fmul fast <2 x float> %strided.vec58, %4 + %6 = fadd fast <2 x float> %5, %2 + %7 = fmul fast <2 x float> %strided.vec58, %1 + %8 = fmul fast <2 x float> %strided.vec59, %4 + %9 = fsub fast <2 x float> %7, %8 + %interleaved.vec = shufflevector <2 x float> %9, <2 x float> %6, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_mul270_mul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v5.2s, v2.2s, v3.2s +; 
CHECK-NEXT: zip2 v2.2s, v2.2s, v3.2s +; CHECK-NEXT: zip1 v6.2s, v1.2s, v4.2s +; CHECK-NEXT: zip2 v1.2s, v1.2s, v4.2s +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fmul v7.2s, v6.2s, v5.2s +; CHECK-NEXT: fneg v4.2s, v7.2s +; CHECK-NEXT: zip2 v7.2s, v0.2s, v3.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v3.2s +; CHECK-NEXT: fmla v4.2s, v2.2s, v1.2s +; CHECK-NEXT: fmul v1.2s, v1.2s, v5.2s +; CHECK-NEXT: fmul v3.2s, v4.2s, v7.2s +; CHECK-NEXT: fmla v1.2s, v2.2s, v6.2s +; CHECK-NEXT: fmul v2.2s, v4.2s, v0.2s +; CHECK-NEXT: fneg v3.2s, v3.2s +; CHECK-NEXT: fmla v2.2s, v7.2s, v1.2s +; CHECK-NEXT: fmla v3.2s, v0.2s, v1.2s +; CHECK-NEXT: zip2 v1.2s, v3.2s, v2.2s +; CHECK-NEXT: zip1 v0.2s, v3.2s, v2.2s +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec81 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec83 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec84 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec84, %strided.vec + %1 = fmul fast <2 x float> %strided.vec83, %strided.vec81 + %2 = fadd fast <2 x float> %1, %0 + %strided.vec86 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec87 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %3 = fmul fast <2 x float> %2, %strided.vec87 + %4 = fmul fast <2 x float> %strided.vec84, %strided.vec81 + %5 = fmul fast <2 x float> %strided.vec83, %strided.vec + %6 = fsub fast <2 x float> %4, %5 + %7 = fmul fast <2 x float> %6, %strided.vec86 + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec86 + %10 = fmul fast <2 x float> %6, %strided.vec87 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; (a * b) * a +; Expected to transform +define <4 x float> 
@mul_triangle(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: mul_triangle: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v3.4s, #0 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v3.4s, #90 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec37, %strided.vec + %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %2, %strided.vec35 + %4 = fmul fast <2 x float> %strided.vec38, %strided.vec + %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37 + %6 = fadd fast <2 x float> %4, %5 + %7 = fmul fast <2 x float> %6, %strided.vec + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec + %10 = fmul fast <2 x float> %6, %strided.vec35 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + + +; d * (b * a) * (c * a) +; Expected to transform +define <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) { +; CHECK-LABEL: mul_diamond: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: movi v6.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v6.4s, v2.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: fcmla v6.4s, 
v2.4s, v0.4s, #90 +; CHECK-NEXT: fcmla v5.4s, v4.4s, v3.4s, #0 +; CHECK-NEXT: fcmla v5.4s, v4.4s, v3.4s, #90 +; CHECK-NEXT: fcmla v1.4s, v6.4s, v5.4s, #0 +; CHECK-NEXT: fcmla v1.4s, v6.4s, v5.4s, #90 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %d.real = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %d.imag = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %a.imag, %b.real + %1 = fmul fast <2 x float> %a.real, %b.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %a.real, %b.real + %4 = fmul fast <2 x float> %b.imag, %a.imag + %5 = fsub fast <2 x float> %3, %4 + %6 = fmul fast <2 x float> %d.real, %5 + %7 = fmul fast <2 x float> %2, %d.imag + %8 = fmul fast <2 x float> %d.real, %2 + %9 = fmul fast <2 x float> %5, %d.imag + %10 = fsub fast <2 x float> %6, %7 + %11 = fadd fast <2 x float> %8, %9 + %12 = fmul fast <2 x float> %c.real, %a.imag + %13 = fmul fast <2 x float> %c.imag, %a.real + %14 = fadd fast <2 x float> %13, %12 + %15 = fmul fast <2 x float> %14, %10 + %16 = fmul fast <2 x float> %c.real, %a.real + %17 = fmul fast <2 x float> %c.imag, %a.imag + %18 = fsub fast <2 x float> %16, %17 + %19 = fmul fast <2 x float> %18, %11 + %20 = fadd fast <2 x float> %15, %19 + %21 = fmul fast <2 x float> %18, %10 + %22 = fmul fast <2 x float> %14, %11 + %23 = fsub fast <2 x float> %21, %22 + %interleaved.vec = shufflevector <2 x float> %23, <2 x float> %20, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define <4 x float> 
@mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_add90_mul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v2.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: fcmla v4.4s, v2.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v0.4s, v4.4s, v3.4s, #90 +; CHECK-NEXT: ret +entry: + %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + + %i6 = fmul fast <2 x float> %br, %ar + %i7 = fmul fast <2 x float> %bi, %ai + %xr = fsub fast <2 x float> %i6, %i7 + %i9 = fmul fast <2 x float> %bi, %ar + %i10 = fmul fast <2 x float> %br, %ai + %xi = fadd fast <2 x float> %i9, %i10 + + %j6 = fmul fast <2 x float> %cr, %ar + %j7 = fmul fast <2 x float> %ci, %ai + %yr = fsub fast <2 x float> %j6, %j7 + %j9 = fmul fast <2 x float> %ci, %ar + %j10 = fmul fast <2 x float> %cr, %ai + %yi = fadd fast <2 x float> %j9, %j10 + + %zr = fsub fast <2 x float> %yr, %xi + %zi = fadd fast <2 x float> %yi, %xr + %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_triangle_addmul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v5.2s, v1.2s, v3.2s +; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s +; CHECK-NEXT: zip1 v6.2s, v0.2s, v4.2s +; CHECK-NEXT: zip2 v0.2s, v0.2s, v4.2s +; 
CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: fmul v7.2s, v5.2s, v6.2s +; CHECK-NEXT: fmul v6.2s, v1.2s, v6.2s +; CHECK-NEXT: zip1 v4.2s, v2.2s, v3.2s +; CHECK-NEXT: zip2 v2.2s, v2.2s, v3.2s +; CHECK-NEXT: fmov d3, d7 +; CHECK-NEXT: fmov d16, d6 +; CHECK-NEXT: fmls v7.2s, v0.2s, v2.2s +; CHECK-NEXT: fmla v6.2s, v0.2s, v4.2s +; CHECK-NEXT: fmls v3.2s, v0.2s, v1.2s +; CHECK-NEXT: fmla v16.2s, v0.2s, v5.2s +; CHECK-NEXT: fsub v0.2s, v7.2s, v16.2s +; CHECK-NEXT: fadd v1.2s, v6.2s, v3.2s +; CHECK-NEXT: zip2 v2.2s, v0.2s, v1.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: ret +entry: + %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + + %i6 = fmul fast <2 x float> %br, %ar + %i7 = fmul fast <2 x float> %bi, %ai + %xr = fsub fast <2 x float> %i6, %i7 + %i9 = fmul fast <2 x float> %bi, %ar + %i10 = fmul fast <2 x float> %br, %ai + %xi = fadd fast <2 x float> %i9, %i10 + + ;%j6 = fmul fast <2 x float> %cr, %ar + %j7 = fmul fast <2 x float> %ci, %ai + %yr = fsub fast <2 x float> %i6, %j7 + ;%j9 = fmul fast <2 x float> %ci, %ar + %j10 = fmul fast <2 x float> %cr, %ai + %yi = fadd fast <2 x float> %i9, %j10 + + %zr = fsub fast <2 x float> %yr, %xi + %zi = fadd fast <2 x float> %yi, %xr + %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) { +; CHECK-LABEL: mul_triangle_multiuses: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, 
v1.16b, #8 +; CHECK-NEXT: zip2 v4.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v5.2s, v1.2s, v3.2s +; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s +; CHECK-NEXT: fmul v2.2s, v4.2s, v5.2s +; CHECK-NEXT: fmul v3.2s, v1.2s, v4.2s +; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s +; CHECK-NEXT: fneg v1.2s, v3.2s +; CHECK-NEXT: fmul v3.2s, v2.2s, v4.2s +; CHECK-NEXT: fmla v1.2s, v0.2s, v5.2s +; CHECK-NEXT: fmul v5.2s, v2.2s, v0.2s +; CHECK-NEXT: fneg v3.2s, v3.2s +; CHECK-NEXT: fmla v5.2s, v4.2s, v1.2s +; CHECK-NEXT: fmla v3.2s, v0.2s, v1.2s +; CHECK-NEXT: mov v1.d[1], v2.d[0] +; CHECK-NEXT: zip2 v4.2s, v3.2s, v5.2s +; CHECK-NEXT: zip1 v0.2s, v3.2s, v5.2s +; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: mov v0.d[1], v4.d[0] +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec37, %strided.vec + %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %2, %strided.vec35 + %4 = fmul fast <2 x float> %strided.vec38, %strided.vec + %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37 + %6 = fadd fast <2 x float> %4, %5 + %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> + store <4 x float> %otheruse, ptr %p + %7 = fmul fast <2 x float> %6, %strided.vec + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec + %10 = fmul fast <2 x float> %6, %strided.vec35 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll 
b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -0,0 +1,325 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; Expected to transform +define <4 x float> @simple_mul(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_mul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec20, %strided.vec + %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17 + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %strided.vec19, %strided.vec + %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20 + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define <4 x float> @simple_mul_no_contract(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_mul_no_contract: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s +; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s +; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s +; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s +; CHECK-NEXT: fmul v2.2s, v5.2s, v4.2s +; CHECK-NEXT: fmul v3.2s, v1.2s, v4.2s +; CHECK-NEXT: 
fmul v4.2s, v0.2s, v5.2s +; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s +; CHECK-NEXT: fsub v0.2s, v3.2s, v4.2s +; CHECK-NEXT: zip2 v1.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec20, %strided.vec + %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17 + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %strided.vec19, %strided.vec + %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20 + %5 = fsub <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define <4 x float> @three_way_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: three_way_mul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: fcmla v3.4s, v2.4s, v4.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v2.4s, v4.4s, #90 +; CHECK-NEXT: mov v0.16b, v3.16b +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec39 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec41 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec42 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec44 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec45 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> 
%strided.vec41, %strided.vec + %1 = fmul fast <2 x float> %strided.vec42, %strided.vec39 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %2, %strided.vec45 + %4 = fmul fast <2 x float> %strided.vec42, %strided.vec + %5 = fmul fast <2 x float> %strided.vec39, %strided.vec41 + %6 = fadd fast <2 x float> %4, %5 + %7 = fmul fast <2 x float> %6, %strided.vec44 + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec44 + %10 = fmul fast <2 x float> %6, %strided.vec45 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define <4 x float> @simple_add_90(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_add_90: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fsub fast <2 x float> %strided.vec19, %strided.vec17 + %1 = fadd fast <2 x float> %strided.vec20, %strided.vec + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform, fadd commutativity is not yet implemented +define <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_add_270_false: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s +; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s +; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s +; CHECK-NEXT: fadd v1.2s, v1.2s, v4.2s +; CHECK-NEXT: fsub v0.2s, v0.2s, v2.2s +; 
CHECK-NEXT: zip2 v2.2s, v1.2s, v0.2s +; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s +; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fadd fast <2 x float> %strided.vec20, %strided.vec + %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19 + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define <4 x float> @simple_add_270_true(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_add_270_true: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcadd v0.4s, v0.4s, v1.4s, #270 +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fadd fast <2 x float> %strided.vec, %strided.vec20 + %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19 + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define <4 x float> @add_external_use(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: add_external_use: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s +; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s +; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s +; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s +; CHECK-NEXT: fsub v1.2s, v4.2s, v1.2s +; CHECK-NEXT: fadd v0.2s, v0.2s, v2.2s +; CHECK-NEXT: zip2 
v2.2s, v1.2s, v0.2s +; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s +; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fsub fast <2 x float> %a.real, %b.imag + %1 = fadd fast <2 x float> %a.imag, %b.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + %dup = shufflevector <2 x float> %0, <2 x float> poison, <4 x i32> + %interleaved.vec2 = shufflevector <4 x float> %interleaved.vec, <4 x float> %dup, <4 x i32> + ret <4 x float> %interleaved.vec2 +} + +; Expected to transform +define <4 x float> @mul_mul_with_fneg(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: mul_mul_with_fneg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.4s, v1.4s, v0.4s, #270 +; CHECK-NEXT: fcmla v2.4s, v1.4s, v0.4s, #180 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fneg fast <2 x float> %a.imag + %1 = fmul fast <2 x float> %b.real, %0 + %2 = fmul fast <2 x float> %a.real, %b.imag + %3 = fsub fast <2 x float> %1, %2 + %4 = fmul fast <2 x float> %b.imag, %a.imag + %5 = fmul fast <2 x float> %a.real, %b.real + %6 = fsub fast <2 x float> %4, %5 + %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %3, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) { +; CHECK-LABEL: abp90c12: +; CHECK: // %bb.0: 
// %entry +; CHECK-NEXT: ldr s21, [sp, #32] +; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr s23, [sp, #40] +; CHECK-NEXT: add x11, sp, #56 +; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: ldr s2, [sp] +; CHECK-NEXT: add x10, sp, #16 +; CHECK-NEXT: ld1 { v21.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #64 +; CHECK-NEXT: ld1 { v23.s }[1], [x11] +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 +; CHECK-NEXT: ldr s22, [sp, #96] +; CHECK-NEXT: add x11, sp, #24 +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: ld1 { v21.s }[2], [x9] +; CHECK-NEXT: ldr s24, [sp, #8] +; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v23.s }[2], [x10] +; CHECK-NEXT: add x10, sp, #80 +; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 +; CHECK-NEXT: ldr s18, [sp, #128] +; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-NEXT: mov v1.s[2], v5.s[0] +; CHECK-NEXT: ldr s20, [sp, #104] +; CHECK-NEXT: ld1 { v24.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #88 +; CHECK-NEXT: ld1 { v22.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #144 +; CHECK-NEXT: ld1 { v21.s }[3], [x10] +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: ld1 { v23.s }[3], [x11] +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: add x11, sp, #152 +; CHECK-NEXT: ld1 { v20.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: mov v1.s[3], v7.s[0] +; CHECK-NEXT: ldr s17, [sp, #136] +; CHECK-NEXT: ldr s19, [sp, #192] +; CHECK-NEXT: add x9, sp, #208 +; CHECK-NEXT: mov v0.s[3], v6.s[0] +; CHECK-NEXT: ld1 { v18.s }[2], [x10] +; CHECK-NEXT: ld1 { v17.s }[1], [x11] +; CHECK-NEXT: add x10, sp, #176 +; CHECK-NEXT: fmul v3.4s, v23.4s, v1.4s +; CHECK-NEXT: ld1 { v19.s }[1], [x9] 
+; CHECK-NEXT: fmul v4.4s, v20.4s, v24.4s +; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: fmul v1.4s, v21.4s, v1.4s +; CHECK-NEXT: ld1 { v18.s }[3], [x10] +; CHECK-NEXT: fmul v5.4s, v22.4s, v24.4s +; CHECK-NEXT: ldr s16, [sp, #200] +; CHECK-NEXT: ld1 { v17.s }[2], [x9] +; CHECK-NEXT: add x11, sp, #216 +; CHECK-NEXT: fneg v3.4s, v3.4s +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: fneg v4.4s, v4.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v23.4s +; CHECK-NEXT: fmla v5.4s, v2.4s, v20.4s +; CHECK-NEXT: ld1 { v16.s }[1], [x11] +; CHECK-NEXT: ld1 { v17.s }[3], [x9] +; CHECK-NEXT: fmla v3.4s, v0.4s, v21.4s +; CHECK-NEXT: fmla v4.4s, v2.4s, v22.4s +; CHECK-NEXT: fsub v0.4s, v18.4s, v1.4s +; CHECK-NEXT: fsub v1.4s, v19.4s, v5.4s +; CHECK-NEXT: fadd v2.4s, v17.4s, v3.4s +; CHECK-NEXT: fadd v3.4s, v16.4s, v4.4s +; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 +; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 +; CHECK-NEXT: zip2 v3.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: rev64 v4.4s, v4.4s +; CHECK-NEXT: str q0, [x8] +; CHECK-NEXT: trn2 v4.4s, v4.4s, v5.4s +; CHECK-NEXT: ext v1.16b, v4.16b, v1.16b, #8 +; CHECK-NEXT: mov v3.d[1], v4.d[0] +; CHECK-NEXT: stp q3, q1, [x8, #16] +; CHECK-NEXT: ret +entry: + %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> + %ai = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> + %br = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> + %bi = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> + %cr = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> + %ci = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> + + %i6 = fmul fast <6 x float> %br, %ar + %i7 = fmul fast <6 x float> %bi, %ai + %xr = fsub fast <6 x float> %i6, %i7 + %i9 = fmul fast <6 x float> %bi, %ar + %i10 = fmul fast <6 x float> %br, %ai + %xi = fadd fast <6 
x float> %i9, %i10 + + %zr = fsub fast <6 x float> %cr, %xi + %zi = fadd fast <6 x float> %ci, %xr + %interleaved.vec = shufflevector <6 x float> %zr, <6 x float> %zi, <12 x i32> + ret <12 x float> %interleaved.vec +}