diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -801,6 +801,16 @@ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool isComplexDeinterleavingSupported() const override; + bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const override; + + Value * + createComplexDeinterleavingIR(Instruction *I, + ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const override; + bool hasBitPreservingFPLogic(EVT VT) const override { // FIXME: Is this always true? It should be true for vectors at least. return VT == MVT::f32 || VT == MVT::f64; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22032,3 +22032,103 @@ unsigned Opc, LLT Ty1, LLT Ty2) const { return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); } + +bool AArch64TargetLowering::isComplexDeinterleavingSupported() const { + return Subtarget->hasComplxNum(); +} + +bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + auto *VTy = dyn_cast<FixedVectorType>(Ty); + if (!VTy) + return false; + + auto *ScalarTy = VTy->getScalarType(); + unsigned NumElements = VTy->getNumElements(); + + unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; + if (VTyWidth < 128 && VTyWidth != 64) + return false; + + // 32 is the length of SplitMask in createComplexDeinterleavingIR + if (NumElements > 32) + return false; + + if (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) + return NumElements == 4 || NumElements % 4 == 0; + if (ScalarTy->isFloatTy()) + return 
NumElements == 2 || NumElements == 4 || NumElements % 4 == 0; + if (ScalarTy->isDoubleTy()) + return NumElements == 2 || NumElements % 2 == 0; + return false; +} + +Value *AArch64TargetLowering::createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { + FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType()); + + IRBuilder<> B(I); + + if (Accumulator == nullptr) + Accumulator = ConstantFP::get(Ty, 0); + + unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); + + assert(TyWidth >= 128 || TyWidth == 64); + + if (TyWidth > 128) { + int Stride = Ty->getNumElements() / 2; + const int SplitMask[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + ArrayRef<int> LowerSplitMask(&SplitMask[0], Stride); + ArrayRef<int> UpperSplitMask(&SplitMask[Stride], Stride); + + auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask); + auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask); + auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask); + auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask); + auto *LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask); + auto *UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask); + + auto *LowerSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + auto *UpperSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + + ArrayRef<int> JoinMask(SplitMask, Ty->getNumElements()); + return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); + } + + if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + Intrinsic::ID IntId = Intrinsic::not_intrinsic; + + Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0, + Intrinsic::aarch64_neon_vcmla_rot90, + 
Intrinsic::aarch64_neon_vcmla_rot180, + Intrinsic::aarch64_neon_vcmla_rot270}; + + unsigned IntIdx = Rotation / 90; + IntId = IdMap[IntIdx]; + if (IntId == Intrinsic::not_intrinsic) + return nullptr; + + return B.CreateIntrinsic(IntId, Ty, {Accumulator, InputB, InputA}); + } + + if (OperationType == ComplexDeinterleavingOperation::CAdd) { + Intrinsic::ID IntId = Intrinsic::not_intrinsic; + if (Rotation == 90) + IntId = Intrinsic::aarch64_neon_vcadd_rot90; + else if (Rotation == 270) + IntId = Intrinsic::aarch64_neon_vcadd_rot270; + + if (IntId == Intrinsic::not_intrinsic) + return nullptr; + + return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -581,6 +581,10 @@ addPass(createAArch64StackTaggingPass( /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. 
if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedLoadCombinePass()); diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -85,6 +85,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: AArch64 Stack Tagging +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Interleaved Load Combine Pass diff --git a/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-add.ll --- a/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-add.ll @@ -29,13 +29,7 @@ define <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-LABEL: complex_add_v4f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h -; CHECK-NEXT: uzp2 v0.4h, v0.4h, v0.4h -; CHECK-NEXT: uzp1 v3.4h, v1.4h, v0.4h -; CHECK-NEXT: uzp2 v1.4h, v1.4h, v0.4h -; CHECK-NEXT: fsub v0.4h, v3.4h, v0.4h -; CHECK-NEXT: fadd v1.4h, v1.4h, v2.4h -; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> @@ -51,17 +45,7 @@ define <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_add_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: uzp1 v4.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp2 v0.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp1 v2.4h, v1.4h, v3.4h -; CHECK-NEXT: uzp2 v1.4h, v1.4h, v3.4h -; CHECK-NEXT: fsub v0.4h, v2.4h, v0.4h -; CHECK-NEXT: fadd v1.4h, v1.4h, v4.4h -; CHECK-NEXT: zip2 v2.4h, v0.4h, v1.4h -; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: 
mov v0.d[1], v2.d[0] +; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> @@ -77,14 +61,8 @@ define <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_add_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v4.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v5.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h -; CHECK-NEXT: fsub v2.8h, v4.8h, v0.8h -; CHECK-NEXT: fadd v1.8h, v1.8h, v5.8h -; CHECK-NEXT: zip1 v0.8h, v2.8h, v1.8h -; CHECK-NEXT: zip2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: fcadd v0.8h, v2.8h, v0.8h, #90 +; CHECK-NEXT: fcadd v1.8h, v3.8h, v1.8h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> @@ -100,22 +78,10 @@ define <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_add_v32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v16.8h, v4.8h, v5.8h -; CHECK-NEXT: uzp1 v17.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v18.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp2 v2.8h, v4.8h, v5.8h -; CHECK-NEXT: uzp1 v3.8h, v6.8h, v7.8h -; CHECK-NEXT: uzp2 v4.8h, v6.8h, v7.8h -; CHECK-NEXT: fsub v5.8h, v16.8h, v0.8h -; CHECK-NEXT: fadd v2.8h, v2.8h, v18.8h -; CHECK-NEXT: fsub v3.8h, v3.8h, v1.8h -; CHECK-NEXT: fadd v4.8h, v4.8h, v17.8h -; CHECK-NEXT: zip1 v0.8h, v5.8h, v2.8h -; CHECK-NEXT: zip2 v1.8h, v5.8h, v2.8h -; CHECK-NEXT: zip1 v2.8h, v3.8h, v4.8h -; CHECK-NEXT: zip2 v3.8h, v3.8h, v4.8h +; CHECK-NEXT: fcadd v2.8h, v6.8h, v2.8h, #90 +; CHECK-NEXT: fcadd v0.8h, v4.8h, v0.8h, #90 +; CHECK-NEXT: fcadd v1.8h, v5.8h, v1.8h, #90 +; CHECK-NEXT: fcadd v3.8h, v7.8h, v3.8h, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> diff --git 
a/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-mul.ll --- a/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-mul.ll +++ b/llvm/test/CodeGen/AArch64/complex-arithmetic-f16-mul.ll @@ -35,16 +35,10 @@ define <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-LABEL: complex_mul_v4f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v2.4h, v0.4h, v0.4h -; CHECK-NEXT: uzp2 v3.4h, v1.4h, v0.4h -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h -; CHECK-NEXT: fmul v4.4h, v2.4h, v3.4h -; CHECK-NEXT: uzp1 v1.4h, v1.4h, v0.4h -; CHECK-NEXT: fmul v3.4h, v3.4h, v0.4h -; CHECK-NEXT: fneg v4.4h, v4.4h -; CHECK-NEXT: fmla v3.4h, v2.4h, v1.4h -; CHECK-NEXT: fmla v4.4h, v0.4h, v1.4h -; CHECK-NEXT: zip1 v0.4h, v4.4h, v3.4h +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #0 +; CHECK-NEXT: fcmla v2.4h, v0.4h, v1.4h, #90 +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> @@ -64,20 +58,10 @@ define <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: uzp2 v4.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v2.4h -; CHECK-NEXT: uzp2 v5.4h, v1.4h, v3.4h -; CHECK-NEXT: uzp1 v1.4h, v1.4h, v3.4h -; CHECK-NEXT: fmul v2.4h, v4.4h, v5.4h -; CHECK-NEXT: fmul v3.4h, v5.4h, v0.4h -; CHECK-NEXT: fneg v2.4h, v2.4h -; CHECK-NEXT: fmla v3.4h, v4.4h, v1.4h -; CHECK-NEXT: fmla v2.4h, v0.4h, v1.4h -; CHECK-NEXT: zip2 v1.4h, v2.4h, v3.4h -; CHECK-NEXT: zip1 v0.4h, v2.4h, v3.4h -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #0 +; CHECK-NEXT: fcmla v2.8h, v0.8h, v1.8h, #90 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x half> %a, <8 x half> 
poison, <4 x i32> @@ -97,17 +81,14 @@ define <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v4.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp2 v5.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h -; CHECK-NEXT: fmul v1.8h, v5.8h, v4.8h -; CHECK-NEXT: fmul v3.8h, v4.8h, v0.8h -; CHECK-NEXT: fneg v1.8h, v1.8h -; CHECK-NEXT: fmla v3.8h, v5.8h, v2.8h -; CHECK-NEXT: fmla v1.8h, v0.8h, v2.8h -; CHECK-NEXT: zip1 v0.8h, v1.8h, v3.8h -; CHECK-NEXT: zip2 v1.8h, v1.8h, v3.8h +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.8h, v0.8h, v2.8h, #0 +; CHECK-NEXT: fcmla v5.8h, v1.8h, v3.8h, #0 +; CHECK-NEXT: fcmla v4.8h, v0.8h, v2.8h, #90 +; CHECK-NEXT: fcmla v5.8h, v1.8h, v3.8h, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> @@ -127,28 +108,22 @@ define <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v16.8h, v4.8h, v5.8h -; CHECK-NEXT: uzp1 v17.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v18.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp2 v2.8h, v6.8h, v7.8h -; CHECK-NEXT: uzp1 v3.8h, v4.8h, v5.8h -; CHECK-NEXT: fmul v4.8h, v0.8h, v16.8h -; CHECK-NEXT: uzp1 v5.8h, v6.8h, v7.8h -; CHECK-NEXT: fmul v6.8h, v1.8h, v2.8h -; CHECK-NEXT: fmul v7.8h, v16.8h, v18.8h -; CHECK-NEXT: fneg v4.8h, v4.8h -; CHECK-NEXT: fmul v16.8h, v2.8h, v17.8h -; CHECK-NEXT: fneg v6.8h, v6.8h -; CHECK-NEXT: fmla v7.8h, v0.8h, v3.8h -; CHECK-NEXT: fmla v4.8h, v18.8h, v3.8h -; CHECK-NEXT: fmla v16.8h, v1.8h, v5.8h -; CHECK-NEXT: fmla v6.8h, v17.8h, v5.8h -; CHECK-NEXT: zip1 v0.8h, v4.8h, v7.8h -; CHECK-NEXT: zip2 v1.8h, v4.8h, v7.8h 
-; CHECK-NEXT: zip1 v2.8h, v6.8h, v16.8h -; CHECK-NEXT: zip2 v3.8h, v6.8h, v16.8h +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.8h, v0.8h, v4.8h, #0 +; CHECK-NEXT: fcmla v17.8h, v1.8h, v5.8h, #0 +; CHECK-NEXT: fcmla v18.8h, v2.8h, v6.8h, #0 +; CHECK-NEXT: fcmla v19.8h, v3.8h, v7.8h, #0 +; CHECK-NEXT: fcmla v16.8h, v0.8h, v4.8h, #90 +; CHECK-NEXT: fcmla v17.8h, v1.8h, v5.8h, #90 +; CHECK-NEXT: fcmla v18.8h, v2.8h, v6.8h, #90 +; CHECK-NEXT: fcmla v19.8h, v3.8h, v7.8h, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-add.ll b/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-add.ll --- a/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-add.ll @@ -6,13 +6,7 @@ define <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: complex_add_v2f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v2.2s, v0.s[1] -; CHECK-NEXT: dup v3.2s, v1.s[1] -; CHECK-NEXT: fsub v1.2s, v1.2s, v2.2s -; CHECK-NEXT: fadd v0.2s, v3.2s, v0.2s -; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s +; CHECK-NEXT: fcadd v0.2s, v1.2s, v0.2s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> @@ -28,17 +22,7 @@ define <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: complex_add_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v4.2s, v0.2s, 
v2.2s -; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s -; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fsub v0.2s, v2.2s, v0.2s -; CHECK-NEXT: fadd v1.2s, v1.2s, v4.2s -; CHECK-NEXT: zip2 v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: fcadd v0.4s, v1.4s, v0.4s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> @@ -54,14 +38,8 @@ define <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: complex_add_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp1 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s -; CHECK-NEXT: fsub v2.4s, v4.4s, v0.4s -; CHECK-NEXT: fadd v1.4s, v1.4s, v5.4s -; CHECK-NEXT: zip1 v0.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: fcadd v0.4s, v2.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v1.4s, v3.4s, v1.4s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> @@ -77,22 +55,10 @@ define <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: complex_add_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v16.4s, v4.4s, v5.4s -; CHECK-NEXT: uzp1 v17.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp1 v18.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v2.4s, v4.4s, v5.4s -; CHECK-NEXT: uzp1 v3.4s, v6.4s, v7.4s -; CHECK-NEXT: uzp2 v4.4s, v6.4s, v7.4s -; CHECK-NEXT: fsub v5.4s, v16.4s, v0.4s -; CHECK-NEXT: fadd v2.4s, v2.4s, v18.4s -; CHECK-NEXT: fsub v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fadd v4.4s, v4.4s, v17.4s -; CHECK-NEXT: zip1 v0.4s, v5.4s, v2.4s -; CHECK-NEXT: zip2 v1.4s, v5.4s, v2.4s -; CHECK-NEXT: zip1 v2.4s, v3.4s, v4.4s -; CHECK-NEXT: zip2 v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fcadd v2.4s, v6.4s, v2.4s, #90 +; 
CHECK-NEXT: fcadd v0.4s, v4.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v1.4s, v5.4s, v1.4s, #90 +; CHECK-NEXT: fcadd v3.4s, v7.4s, v3.4s, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-mul.ll b/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-mul.ll --- a/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-mul.ll +++ b/llvm/test/CodeGen/AArch64/complex-arithmetic-f32-mul.ll @@ -6,16 +6,10 @@ define <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: complex_mul_v2f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: dup v2.2s, v1.s[1] -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v3.2s, v0.s[1] -; CHECK-NEXT: fmul v4.2s, v0.2s, v1.s[1] -; CHECK-NEXT: fmul v2.2s, v2.2s, v0.s[1] -; CHECK-NEXT: fmla v4.2s, v3.2s, v1.2s -; CHECK-NEXT: fneg v2.2s, v2.2s -; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v4.2s +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #0 +; CHECK-NEXT: fcmla v2.2s, v0.2s, v1.2s, #90 +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> @@ -35,20 +29,10 @@ define <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: complex_mul_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: zip2 v4.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip2 v5.2s, v1.2s, v3.2s -; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fmul v2.2s, v4.2s, v5.2s -; CHECK-NEXT: fmul v3.2s, v5.2s, v0.2s -; CHECK-NEXT: fneg v2.2s, v2.2s -; CHECK-NEXT: fmla v3.2s, v4.2s, v1.2s -; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s -; CHECK-NEXT: zip2 v1.2s, v2.2s, v3.2s -; CHECK-NEXT: zip1 v0.2s, v2.2s, v3.2s -; 
CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -68,17 +52,14 @@ define <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: complex_mul_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fmul v1.4s, v5.4s, v4.4s -; CHECK-NEXT: fmul v3.4s, v4.4s, v0.4s -; CHECK-NEXT: fneg v1.4s, v1.4s -; CHECK-NEXT: fmla v3.4s, v5.4s, v2.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: zip1 v0.4s, v1.4s, v3.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.4s, v0.4s, v2.4s, #0 +; CHECK-NEXT: fcmla v5.4s, v1.4s, v3.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v0.4s, v2.4s, #90 +; CHECK-NEXT: fcmla v5.4s, v1.4s, v3.4s, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> @@ -98,28 +79,22 @@ define <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: complex_mul_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 v16.4s, v4.4s, v5.4s -; CHECK-NEXT: uzp1 v17.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp1 v18.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v2.4s, v6.4s, v7.4s -; CHECK-NEXT: uzp1 v3.4s, v4.4s, v5.4s -; CHECK-NEXT: fmul v4.4s, v0.4s, v16.4s -; CHECK-NEXT: uzp1 v5.4s, v6.4s, v7.4s -; CHECK-NEXT: fmul v6.4s, v1.4s, v2.4s -; CHECK-NEXT: fmul v7.4s, v16.4s, v18.4s -; CHECK-NEXT: fneg v4.4s, v4.4s -; CHECK-NEXT: fmul v16.4s, v2.4s, 
v17.4s -; CHECK-NEXT: fneg v6.4s, v6.4s -; CHECK-NEXT: fmla v7.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v4.4s, v18.4s, v3.4s -; CHECK-NEXT: fmla v16.4s, v1.4s, v5.4s -; CHECK-NEXT: fmla v6.4s, v17.4s, v5.4s -; CHECK-NEXT: zip1 v0.4s, v4.4s, v7.4s -; CHECK-NEXT: zip2 v1.4s, v4.4s, v7.4s -; CHECK-NEXT: zip1 v2.4s, v6.4s, v16.4s -; CHECK-NEXT: zip2 v3.4s, v6.4s, v16.4s +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.4s, v0.4s, v4.4s, #0 +; CHECK-NEXT: fcmla v17.4s, v1.4s, v5.4s, #0 +; CHECK-NEXT: fcmla v18.4s, v2.4s, v6.4s, #0 +; CHECK-NEXT: fcmla v19.4s, v3.4s, v7.4s, #0 +; CHECK-NEXT: fcmla v16.4s, v0.4s, v4.4s, #90 +; CHECK-NEXT: fcmla v17.4s, v1.4s, v5.4s, #90 +; CHECK-NEXT: fcmla v18.4s, v2.4s, v6.4s, #90 +; CHECK-NEXT: fcmla v19.4s, v3.4s, v7.4s, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-add.ll b/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-add.ll --- a/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-add.ll @@ -6,12 +6,7 @@ define <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: complex_add_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fsub d1, d1, d2 -; CHECK-NEXT: fadd d0, d3, d0 -; CHECK-NEXT: mov v1.d[1], v0.d[0] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fcadd v0.2d, v1.2d, v0.2d, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> @@ -27,14 +22,8 @@ define <4 x double> 
@complex_add_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: complex_add_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v5.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: fsub v2.2d, v4.2d, v0.2d -; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d -; CHECK-NEXT: zip1 v0.2d, v2.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v1.2d +; CHECK-NEXT: fcadd v0.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcadd v1.2d, v3.2d, v1.2d, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> @@ -50,22 +39,10 @@ define <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: complex_add_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d -; CHECK-NEXT: zip2 v4.2d, v6.2d, v7.2d -; CHECK-NEXT: fsub v5.2d, v16.2d, v0.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v18.2d -; CHECK-NEXT: fsub v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fadd v4.2d, v4.2d, v17.2d -; CHECK-NEXT: zip1 v0.2d, v5.2d, v2.2d -; CHECK-NEXT: zip2 v1.2d, v5.2d, v2.2d -; CHECK-NEXT: zip1 v2.2d, v3.2d, v4.2d -; CHECK-NEXT: zip2 v3.2d, v3.2d, v4.2d +; CHECK-NEXT: fcadd v2.2d, v6.2d, v2.2d, #90 +; CHECK-NEXT: fcadd v0.2d, v4.2d, v0.2d, #90 +; CHECK-NEXT: fcadd v1.2d, v5.2d, v1.2d, #90 +; CHECK-NEXT: fcadd v3.2d, v7.2d, v3.2d, #90 ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-mul.ll b/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-mul.ll --- a/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-mul.ll +++ b/llvm/test/CodeGen/AArch64/complex-arithmetic-f64-mul.ll @@ -6,13 +6,10 @@ 
define <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fmul d4, d3, d0 -; CHECK-NEXT: fmul d3, d2, d3 -; CHECK-NEXT: fmadd d2, d1, d2, d4 -; CHECK-NEXT: fnmsub d0, d1, d0, d3 -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v2.2d, v0.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> @@ -32,17 +29,14 @@ define <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: complex_mul_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v5.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmul v1.2d, v5.2d, v4.2d -; CHECK-NEXT: fmul v3.2d, v4.2d, v0.2d -; CHECK-NEXT: fneg v1.2d, v1.2d -; CHECK-NEXT: fmla v3.2d, v5.2d, v2.2d -; CHECK-NEXT: fmla v1.2d, v0.2d, v2.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v3.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v3.2d +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: fcmla v4.2d, v0.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v5.2d, v1.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v4.2d, v0.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v5.2d, v1.2d, v3.2d, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -62,28 +56,22 @@ define <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: complex_mul_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v16.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 
v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v6.2d, v7.2d -; CHECK-NEXT: zip1 v3.2d, v4.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v0.2d, v16.2d -; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d -; CHECK-NEXT: fmul v6.2d, v1.2d, v2.2d -; CHECK-NEXT: fmul v7.2d, v16.2d, v18.2d -; CHECK-NEXT: fneg v4.2d, v4.2d -; CHECK-NEXT: fmul v16.2d, v2.2d, v17.2d -; CHECK-NEXT: fneg v6.2d, v6.2d -; CHECK-NEXT: fmla v7.2d, v0.2d, v3.2d -; CHECK-NEXT: fmla v4.2d, v18.2d, v3.2d -; CHECK-NEXT: fmla v16.2d, v1.2d, v5.2d -; CHECK-NEXT: fmla v6.2d, v17.2d, v5.2d -; CHECK-NEXT: zip1 v0.2d, v4.2d, v7.2d -; CHECK-NEXT: zip2 v1.2d, v4.2d, v7.2d -; CHECK-NEXT: zip1 v2.2d, v6.2d, v16.2d -; CHECK-NEXT: zip2 v3.2d, v6.2d, v16.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v4.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v18.2d, v2.2d, v6.2d, #0 +; CHECK-NEXT: fcmla v19.2d, v3.2d, v7.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v4.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v5.2d, #90 +; CHECK-NEXT: fcmla v18.2d, v2.2d, v6.2d, #90 +; CHECK-NEXT: fcmla v19.2d, v3.2d, v7.2d, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b +; CHECK-NEXT: mov v2.16b, v18.16b +; CHECK-NEXT: mov v3.16b, v19.16b ; CHECK-NEXT: ret entry: %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32>