diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -796,6 +796,17 @@
 
   bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
 
+  bool isComplexDeinterleavingSupported() const override;
+  bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const override;
+
+  Value *
+  createComplexDeinterleavingIR(Instruction *I,
+                                ComplexDeinterleavingOperation OperationType,
+                                ComplexDeinterleavingRotation Rotation,
+                                Value *InputA, Value *InputB,
+                                Value *Accumulator = nullptr) const override;
+
   bool hasBitPreservingFPLogic(EVT VT) const override {
     // FIXME: Is this always true? It should be true for vectors at least.
     return VT == MVT::f32 || VT == MVT::f64;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22230,3 +22230,119 @@
     unsigned Opc, LLT Ty1, LLT Ty2) const {
   return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
 }
+
+/// Report complex deinterleaving as supported only when the subtarget has both
+/// the ComplxNum and NEON features.
+bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
+  return Subtarget->hasComplxNum() && Subtarget->hasNEON();
+}
+
+/// Return true if a complex operation on vectors of type \p Ty can be lowered
+/// by createComplexDeinterleavingIR.
+///
+/// Accepted types are fixed-width vectors of half (requires full FP16), float
+/// or double whose total width is 64 bits or a power of two of at least 128
+/// bits. \p Operation is not consulted.
+bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
+    ComplexDeinterleavingOperation Operation, Type *Ty) const {
+  auto *VTy = dyn_cast<FixedVectorType>(Ty);
+  if (!VTy)
+    return false;
+
+  auto *ScalarTy = VTy->getScalarType();
+  unsigned NumElements = VTy->getNumElements();
+
+  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
+  if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth))
+    return false;
+
+  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
+         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
+}
+
+/// Emit the AArch64 NEON complex-arithmetic intrinsic(s) for a matched complex
+/// operation, inserting the new instructions before \p I.
+///
+/// Vectors wider than 128 bits are split in half and lowered recursively; the
+/// two partial results are then concatenated again. When \p Accumulator is
+/// null, CMulPartial accumulates onto a zero vector. Returns nullptr for
+/// operation/rotation combinations that have no lowering here.
+Value *AArch64TargetLowering::createComplexDeinterleavingIR(
+    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+    Value *Accumulator) const {
+  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
+
+  IRBuilder<> B(I);
+
+  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
+
+  // Mirrors the width check in isComplexDeinterleavingOperationSupported.
+  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
+         "Vector width must be 64 bits or a power of 2 of at least 128 bits");
+
+  if (TyWidth > 128) {
+    int Stride = Ty->getNumElements() / 2;
+    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
+    auto SplitSeqVec = llvm::to_vector(SplitSeq);
+    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
+    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
+
+    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
+    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
+    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
+    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
+    Value *LowerSplitAcc = nullptr;
+    Value *UpperSplitAcc = nullptr;
+
+    // Only split an accumulator the caller actually supplied; a missing one
+    // is materialised as zero at the leaves, where it is needed.
+    if (Accumulator) {
+      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
+      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
+    }
+
+    auto *LowerSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+    auto *UpperSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+
+    // Propagate failure instead of shuffling null halves together.
+    if (!LowerSplitInt || !UpperSplitInt)
+      return nullptr;
+
+    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
+    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+    // A partial multiply with no incoming accumulator accumulates onto zero.
+    if (Accumulator == nullptr)
+      Accumulator = ConstantFP::get(Ty, 0);
+
+    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
+                              Intrinsic::aarch64_neon_vcmla_rot90,
+                              Intrinsic::aarch64_neon_vcmla_rot180,
+                              Intrinsic::aarch64_neon_vcmla_rot270};
+
+    // Rotation's enumerator value is used directly as the table index.
+    return B.CreateIntrinsic(IdMap[static_cast<int>(Rotation)], Ty,
+                             {Accumulator, InputB, InputA});
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    // Only Rotation_90 and Rotation_270 are mapped to a vcadd intrinsic.
+    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
+    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
+      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
+    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
+      IntId = Intrinsic::aarch64_neon_vcadd_rot270;
+
+    if (IntId == Intrinsic::not_intrinsic)
+      return nullptr;
+
+    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
+  }
+
+  return nullptr;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -582,6 +582,10 @@
   addPass(createAArch64StackTaggingPass(
       /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
 
+  // Match complex arithmetic patterns
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(createComplexDeinterleavingPass(TM));
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None) {
     addPass(createInterleavedLoadCombinePass());
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -85,6 +85,7 @@
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: AArch64 Stack Tagging
+; CHECK-NEXT: Complex Deinterleaving Pass
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: Interleaved Load Combine Pass
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -31,13 +31,7 @@
 define <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-LABEL: complex_add_v4f16:
 ; CHECK: // %bb.0:
// %entry -; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h -; CHECK-NEXT: uzp2 v0.4h, v0.4h, v0.4h -; CHECK-NEXT: uzp1 v3.4h, v1.4h, v0.4h -; CHECK-NEXT: uzp2 v1.4h, v1.4h, v0.4h -; CHECK-NEXT: fsub v0.4h, v3.4h, v0.4h -; CHECK-NEXT: fadd v1.4h, v1.4h, v2.4h -; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> @@ -54,17 +48,7 @@ define <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_add_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: uzp1 v4.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp2 v0.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp1 v2.4h, v1.4h, v3.4h -; CHECK-NEXT: uzp2 v1.4h, v1.4h, v3.4h -; CHECK-NEXT: fsub v0.4h, v2.4h, v0.4h -; CHECK-NEXT: fadd v1.4h, v1.4h, v4.4h -; CHECK-NEXT: zip2 v2.4h, v0.4h, v1.4h -; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> @@ -81,14 +65,8 @@ define <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_add_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v4.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v5.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h -; CHECK-NEXT: fsub v2.8h, v4.8h, v0.8h -; CHECK-NEXT: fadd v1.8h, v1.8h, v5.8h -; CHECK-NEXT: zip1 v0.8h, v2.8h, v1.8h -; CHECK-NEXT: zip2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: fcadd v0.8h, v2.8h, v0.8h, #90 +; CHECK-NEXT: fcadd v1.8h, v3.8h, v1.8h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> @@ -105,22 +83,10 @@ define <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_add_v32f16: ; CHECK: // 
%bb.0: // %entry -; CHECK-NEXT: uzp1 v16.8h, v4.8h, v5.8h -; CHECK-NEXT: uzp1 v17.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v18.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp2 v2.8h, v4.8h, v5.8h -; CHECK-NEXT: uzp1 v3.8h, v6.8h, v7.8h -; CHECK-NEXT: uzp2 v4.8h, v6.8h, v7.8h -; CHECK-NEXT: fsub v5.8h, v16.8h, v0.8h -; CHECK-NEXT: fadd v2.8h, v2.8h, v18.8h -; CHECK-NEXT: fsub v3.8h, v3.8h, v1.8h -; CHECK-NEXT: fadd v4.8h, v4.8h, v17.8h -; CHECK-NEXT: zip1 v0.8h, v5.8h, v2.8h -; CHECK-NEXT: zip2 v1.8h, v5.8h, v2.8h -; CHECK-NEXT: zip1 v2.8h, v3.8h, v4.8h -; CHECK-NEXT: zip2 v3.8h, v3.8h, v4.8h +; CHECK-NEXT: fcadd v2.8h, v6.8h, v2.8h, #90 +; CHECK-NEXT: fcadd v0.8h, v4.8h, v0.8h, #90 +; CHECK-NEXT: fcadd v1.8h, v5.8h, v1.8h, #90 +; CHECK-NEXT: fcadd v3.8h, v7.8h, v3.8h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll @@ -37,16 +37,10 @@ define <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-LABEL: complex_mul_v4f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v2.4h, v0.4h, v0.4h -; CHECK-NEXT: uzp2 v3.4h, v1.4h, v0.4h -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h -; CHECK-NEXT: fmul v4.4h, v2.4h, v3.4h -; CHECK-NEXT: uzp1 v1.4h, v1.4h, v0.4h -; CHECK-NEXT: fmul v3.4h, v3.4h, v0.4h -; CHECK-NEXT: fneg v4.4h, v4.4h -; CHECK-NEXT: fmla v3.4h, v2.4h, v1.4h -; CHECK-NEXT: fmla v4.4h, v0.4h, v1.4h -; CHECK-NEXT: zip1 v0.4h, v4.4h, v3.4h +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #0 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #90 +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x half> %a, <4 x half> 
poison, <2 x i32> @@ -67,20 +61,10 @@ define <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: uzp2 v4.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp2 v5.4h, v1.4h, v3.4h -; CHECK-NEXT: uzp1 v1.4h, v1.4h, v3.4h -; CHECK-NEXT: fmul v2.4h, v4.4h, v5.4h -; CHECK-NEXT: fmul v3.4h, v5.4h, v0.4h -; CHECK-NEXT: fneg v2.4h, v2.4h -; CHECK-NEXT: fmla v3.4h, v4.4h, v1.4h -; CHECK-NEXT: fmla v2.4h, v0.4h, v1.4h -; CHECK-NEXT: zip2 v1.4h, v2.4h, v3.4h -; CHECK-NEXT: zip1 v0.4h, v2.4h, v3.4h -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #0 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #90 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> @@ -101,17 +85,14 @@ define <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v4.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp2 v5.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h -; CHECK-NEXT: fmul v1.8h, v5.8h, v4.8h -; CHECK-NEXT: fmul v3.8h, v4.8h, v0.8h -; CHECK-NEXT: fneg v1.8h, v1.8h -; CHECK-NEXT: fmla v3.8h, v5.8h, v2.8h -; CHECK-NEXT: fmla v1.8h, v0.8h, v2.8h -; CHECK-NEXT: zip1 v0.8h, v1.8h, v3.8h -; CHECK-NEXT: zip2 v1.8h, v1.8h, v3.8h +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.8h, v0.8h, v2.8h, #0 +; CHECK-NEXT: fcmla v5.8h, v1.8h, v3.8h, #0 +; CHECK-NEXT: fcmla v4.8h, v0.8h, v2.8h, #90 +; CHECK-NEXT: fcmla v5.8h, v1.8h, v3.8h, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x half> %a, <16 x 
half> poison, <8 x i32> @@ -132,28 +113,22 @@ define <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v16.8h, v4.8h, v5.8h -; CHECK-NEXT: uzp1 v17.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v18.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp2 v2.8h, v6.8h, v7.8h -; CHECK-NEXT: uzp1 v3.8h, v4.8h, v5.8h -; CHECK-NEXT: fmul v4.8h, v0.8h, v16.8h -; CHECK-NEXT: uzp1 v5.8h, v6.8h, v7.8h -; CHECK-NEXT: fmul v6.8h, v1.8h, v2.8h -; CHECK-NEXT: fmul v7.8h, v16.8h, v18.8h -; CHECK-NEXT: fneg v4.8h, v4.8h -; CHECK-NEXT: fmul v16.8h, v2.8h, v17.8h -; CHECK-NEXT: fneg v6.8h, v6.8h -; CHECK-NEXT: fmla v7.8h, v0.8h, v3.8h -; CHECK-NEXT: fmla v4.8h, v18.8h, v3.8h -; CHECK-NEXT: fmla v16.8h, v1.8h, v5.8h -; CHECK-NEXT: fmla v6.8h, v17.8h, v5.8h -; CHECK-NEXT: zip1 v0.8h, v4.8h, v7.8h -; CHECK-NEXT: zip2 v1.8h, v4.8h, v7.8h -; CHECK-NEXT: zip1 v2.8h, v6.8h, v16.8h -; CHECK-NEXT: zip2 v3.8h, v6.8h, v16.8h +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.8h, v0.8h, v4.8h, #0 +; CHECK-NEXT: fcmla v17.8h, v1.8h, v5.8h, #0 +; CHECK-NEXT: fcmla v18.8h, v2.8h, v6.8h, #0 +; CHECK-NEXT: fcmla v19.8h, v3.8h, v7.8h, #0 +; CHECK-NEXT: fcmla v16.8h, v0.8h, v4.8h, #90 +; CHECK-NEXT: fcmla v17.8h, v1.8h, v5.8h, #90 +; CHECK-NEXT: fcmla v18.8h, v2.8h, v6.8h, #90 +; CHECK-NEXT: fcmla v19.8h, v3.8h, v7.8h, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll --- 
a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll @@ -8,13 +8,7 @@ define <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: complex_add_v2f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v2.2s, v0.s[1] -; CHECK-NEXT: dup v3.2s, v1.s[1] -; CHECK-NEXT: fsub v1.2s, v1.2s, v2.2s -; CHECK-NEXT: fadd v0.2s, v3.2s, v0.2s -; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s +; CHECK-NEXT: fcadd v0.2s, v1.2s, v0.2s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> @@ -31,17 +25,7 @@ define <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: complex_add_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s -; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s -; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fsub v0.2s, v2.2s, v0.2s -; CHECK-NEXT: fadd v1.2s, v1.2s, v4.2s -; CHECK-NEXT: zip2 v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: fcadd v0.4s, v1.4s, v0.4s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> @@ -58,14 +42,8 @@ define <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: complex_add_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp1 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s -; CHECK-NEXT: fsub v2.4s, v4.4s, v0.4s -; CHECK-NEXT: fadd v1.4s, v1.4s, v5.4s -; CHECK-NEXT: zip1 v0.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: fcadd v0.4s, v2.4s, v0.4s, #90 +; 
CHECK-NEXT: fcadd v1.4s, v3.4s, v1.4s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> @@ -82,22 +60,10 @@ define <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: complex_add_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v16.4s, v4.4s, v5.4s -; CHECK-NEXT: uzp1 v17.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp1 v18.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v2.4s, v4.4s, v5.4s -; CHECK-NEXT: uzp1 v3.4s, v6.4s, v7.4s -; CHECK-NEXT: uzp2 v4.4s, v6.4s, v7.4s -; CHECK-NEXT: fsub v5.4s, v16.4s, v0.4s -; CHECK-NEXT: fadd v2.4s, v2.4s, v18.4s -; CHECK-NEXT: fsub v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fadd v4.4s, v4.4s, v17.4s -; CHECK-NEXT: zip1 v0.4s, v5.4s, v2.4s -; CHECK-NEXT: zip2 v1.4s, v5.4s, v2.4s -; CHECK-NEXT: zip1 v2.4s, v3.4s, v4.4s -; CHECK-NEXT: zip2 v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fcadd v2.4s, v6.4s, v2.4s, #90 +; CHECK-NEXT: fcadd v0.4s, v4.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v1.4s, v5.4s, v1.4s, #90 +; CHECK-NEXT: fcadd v3.4s, v7.4s, v3.4s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll @@ -7,16 +7,10 @@ define <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: complex_mul_v2f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: dup v2.2s, v1.s[1] -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v3.2s, v0.s[1] -; CHECK-NEXT: fmul v4.2s, v0.2s, v1.s[1] -; CHECK-NEXT: fmul v2.2s, v2.2s, v0.s[1] -; CHECK-NEXT: fmla v4.2s, v3.2s, v1.2s -; CHECK-NEXT: fneg v2.2s, v2.2s -; 
CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v4.2s +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #0 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #90 +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> @@ -37,20 +31,10 @@ define <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: complex_mul_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: zip2 v4.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s -; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fmul v2.2s, v4.2s, v5.2s -; CHECK-NEXT: fmul v3.2s, v5.2s, v0.2s -; CHECK-NEXT: fneg v2.2s, v2.2s -; CHECK-NEXT: fmla v3.2s, v4.2s, v1.2s -; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip2 v1.2s, v2.2s, v3.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v3.2s -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -71,17 +55,14 @@ define <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: complex_mul_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fmul v1.4s, v5.4s, v4.4s -; CHECK-NEXT: fmul v3.4s, v4.4s, v0.4s -; CHECK-NEXT: fneg v1.4s, v1.4s -; CHECK-NEXT: fmla v3.4s, v5.4s, v2.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: zip1 v0.4s, v1.4s, v3.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; 
CHECK-NEXT: fcmla v4.4s, v0.4s, v2.4s, #0 +; CHECK-NEXT: fcmla v5.4s, v1.4s, v3.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v0.4s, v2.4s, #90 +; CHECK-NEXT: fcmla v5.4s, v1.4s, v3.4s, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> @@ -102,28 +83,22 @@ define <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: complex_mul_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v16.4s, v4.4s, v5.4s -; CHECK-NEXT: uzp1 v17.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp1 v18.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v2.4s, v6.4s, v7.4s -; CHECK-NEXT: uzp1 v3.4s, v4.4s, v5.4s -; CHECK-NEXT: fmul v4.4s, v0.4s, v16.4s -; CHECK-NEXT: uzp1 v5.4s, v6.4s, v7.4s -; CHECK-NEXT: fmul v6.4s, v1.4s, v2.4s -; CHECK-NEXT: fmul v7.4s, v16.4s, v18.4s -; CHECK-NEXT: fneg v4.4s, v4.4s -; CHECK-NEXT: fmul v16.4s, v2.4s, v17.4s -; CHECK-NEXT: fneg v6.4s, v6.4s -; CHECK-NEXT: fmla v7.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v4.4s, v18.4s, v3.4s -; CHECK-NEXT: fmla v16.4s, v1.4s, v5.4s -; CHECK-NEXT: fmla v6.4s, v17.4s, v5.4s -; CHECK-NEXT: zip1 v0.4s, v4.4s, v7.4s -; CHECK-NEXT: zip2 v1.4s, v4.4s, v7.4s -; CHECK-NEXT: zip1 v2.4s, v6.4s, v16.4s -; CHECK-NEXT: zip2 v3.4s, v6.4s, v16.4s +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.4s, v0.4s, v4.4s, #0 +; CHECK-NEXT: fcmla v17.4s, v1.4s, v5.4s, #0 +; CHECK-NEXT: fcmla v18.4s, v2.4s, v6.4s, #0 +; CHECK-NEXT: fcmla v19.4s, v3.4s, v7.4s, #0 +; CHECK-NEXT: fcmla v16.4s, v0.4s, v4.4s, #90 +; CHECK-NEXT: fcmla v17.4s, v1.4s, v5.4s, #90 +; CHECK-NEXT: fcmla v18.4s, v2.4s, v6.4s, #90 +; CHECK-NEXT: fcmla v19.4s, v3.4s, v7.4s, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, 
v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll @@ -8,12 +8,7 @@ define <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: complex_add_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fsub d1, d1, d2 -; CHECK-NEXT: fadd d0, d3, d0 -; CHECK-NEXT: mov v1.d[1], v0.d[0] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fcadd v0.2d, v1.2d, v0.2d, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> @@ -30,14 +25,8 @@ define <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: complex_add_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v5.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: fsub v2.2d, v4.2d, v0.2d -; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d -; CHECK-NEXT: zip1 v0.2d, v2.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v1.2d +; CHECK-NEXT: fcadd v0.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcadd v1.2d, v3.2d, v1.2d, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> @@ -54,22 +43,10 @@ define <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: complex_add_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, 
v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d -; CHECK-NEXT: zip2 v4.2d, v6.2d, v7.2d -; CHECK-NEXT: fsub v5.2d, v16.2d, v0.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v18.2d -; CHECK-NEXT: fsub v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fadd v4.2d, v4.2d, v17.2d -; CHECK-NEXT: zip1 v0.2d, v5.2d, v2.2d -; CHECK-NEXT: zip2 v1.2d, v5.2d, v2.2d -; CHECK-NEXT: zip1 v2.2d, v3.2d, v4.2d -; CHECK-NEXT: zip2 v3.2d, v3.2d, v4.2d +; CHECK-NEXT: fcadd v2.2d, v6.2d, v2.2d, #90 +; CHECK-NEXT: fcadd v0.2d, v4.2d, v0.2d, #90 +; CHECK-NEXT: fcadd v1.2d, v5.2d, v1.2d, #90 +; CHECK-NEXT: fcadd v3.2d, v7.2d, v3.2d, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll @@ -7,13 +7,10 @@ define <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fmul d4, d3, d0 -; CHECK-NEXT: fmul d3, d2, d3 -; CHECK-NEXT: fmadd d2, d1, d2, d4 -; CHECK-NEXT: fnmsub d0, d1, d0, d3 -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> @@ -34,17 +31,14 @@ define <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: complex_mul_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v5.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 
v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmul v1.2d, v5.2d, v4.2d -; CHECK-NEXT: fmul v3.2d, v4.2d, v0.2d -; CHECK-NEXT: fneg v1.2d, v1.2d -; CHECK-NEXT: fmla v3.2d, v5.2d, v2.2d -; CHECK-NEXT: fmla v1.2d, v0.2d, v2.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v3.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v3.2d +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.2d, v0.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v5.2d, v1.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v4.2d, v0.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v5.2d, v1.2d, v3.2d, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -65,28 +59,22 @@ define <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: complex_mul_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v16.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v6.2d, v7.2d -; CHECK-NEXT: zip1 v3.2d, v4.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v0.2d, v16.2d -; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d -; CHECK-NEXT: fmul v6.2d, v1.2d, v2.2d -; CHECK-NEXT: fmul v7.2d, v16.2d, v18.2d -; CHECK-NEXT: fneg v4.2d, v4.2d -; CHECK-NEXT: fmul v16.2d, v2.2d, v17.2d -; CHECK-NEXT: fneg v6.2d, v6.2d -; CHECK-NEXT: fmla v7.2d, v0.2d, v3.2d -; CHECK-NEXT: fmla v4.2d, v18.2d, v3.2d -; CHECK-NEXT: fmla v16.2d, v1.2d, v5.2d -; CHECK-NEXT: fmla v6.2d, v17.2d, v5.2d -; CHECK-NEXT: zip1 v0.2d, v4.2d, v7.2d -; CHECK-NEXT: zip2 v1.2d, v4.2d, v7.2d -; CHECK-NEXT: zip1 v2.2d, v6.2d, v16.2d -; CHECK-NEXT: zip2 v3.2d, v6.2d, v16.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v4.2d, #0 +; 
CHECK-NEXT: fcmla v17.2d, v1.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v18.2d, v2.2d, v6.2d, #0 +; CHECK-NEXT: fcmla v19.2d, v3.2d, v7.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v4.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v5.2d, #90 +; CHECK-NEXT: fcmla v18.2d, v2.2d, v6.2d, #90 +; CHECK-NEXT: fcmla v19.2d, v3.2d, v7.2d, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32>