Index: llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -264,6 +264,50 @@
     DupMCID = &TII->get(AArch64::DUPv2i32lane);
     MulMCID = &TII->get(AArch64::FMULv2f32);
     break;
+
+  // 1X64 instructions
+  case AArch64::FMLAv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMADDDrrr);
+    break;
+  case AArch64::FMLSv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMSUBDrrr);
+    break;
+  case AArch64::FMULXv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMULX64);
+    break;
+  case AArch64::FMULv1i64_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::CPYi64);
+    MulMCID = &TII->get(AArch64::FMULDrr);
+    break;
+
+  // 1X32 instructions
+  case AArch64::FMLAv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMADDSrrr);
+    break;
+  case AArch64::FMLSv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMSUBSrrr);
+    break;
+  case AArch64::FMULXv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMULX32);
+    break;
+  case AArch64::FMULv1i32_indexed:
+    RC = &AArch64::FPR32RegClass;
+    DupMCID = &TII->get(AArch64::CPYi32);
+    MulMCID = &TII->get(AArch64::FMULSrr);
+    break;
   }
 
   if (!shouldReplaceInstruction(MI.getParent()->getParent(),
@@ -298,10 +342,26 @@
           .addReg(SrcReg2, Src2IsKill)
           .addImm(LaneNumber);
     }
-    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
-        .addReg(SrcReg0, Src0IsKill)
-        .addReg(SrcReg1, Src1IsKill)
-        .addReg(DupDest, Src2IsKill);
+    // Create the new non-indexed instruction.
+    switch (MI.getOpcode()) {
+    default:
+      // Most replacement instructions keep the accumulator as the first operand.
+      BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+          .addReg(SrcReg0, Src0IsKill)
+          .addReg(SrcReg1, Src1IsKill)
+          .addReg(DupDest, Src2IsKill);
+      break;
+    case AArch64::FMLAv1i32_indexed:
+    case AArch64::FMLSv1i32_indexed:
+    case AArch64::FMLAv1i64_indexed:
+    case AArch64::FMLSv1i64_indexed:
+      // The scalar FMADD/FMSUB instructions take the accumulator last.
+      BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+          .addReg(SrcReg1, Src1IsKill)
+          .addReg(DupDest, Src2IsKill)
+          .addReg(SrcReg0, Src0IsKill);
+      break;
+    }
   } else if (MI.getNumOperands() == 4) {
     unsigned LaneNumber = MI.getOperand(3).getImm();
     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -8,6 +8,10 @@
 
 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
 
+declare float @llvm.aarch64.neon.fmulx.f32(float, float)
+
+declare double @llvm.aarch64.neon.fmulx.f64(double, double)
+
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
 
 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
@@ -530,6 +534,19 @@
   ret <2 x double> %0
 }
 
+define double @test_vfmla_lane_f64(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmla_lane_f64:
+; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmla_lane_f64:
+; EXYNOS: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x double> %v, i32 1
+  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
+  ret double %0
+}
+
 define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
 ; CHECK-LABEL: test_vfmsq_lane_f64:
 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
@@ -593,7 +610,7 @@
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vfmss_lane_f32
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x float> %v, i32 1
@@ -618,7 +635,7 @@
 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vfmsd_laneq_f64
-; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x double> %v, i32 1
@@ -643,7 +660,7 @@
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vfmss_lane_f32_0
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
@@ -1543,7 +1560,7 @@
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vmul_laneq_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
@@ -2058,6 +2075,19 @@
   ret <4 x float> %0
 }
 
+define float @test_vfmla_lane_f32(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmla_lane_f32:
+; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmla_lane_f32:
+; EXYNOS: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x float> %v, i32 1
+  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+  ret float %0
+}
+
 define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfms_lane_f32_0:
 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
@@ -2986,7 +3016,7 @@
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 ; CHECK-NEXT: ret
 ; EXYNOS-LABEL: test_vmul_laneq_f64_0:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 ; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
@@ -3109,6 +3139,32 @@
   ret <2 x double> %vmulx2.i
 }
 
+define double @test_vmulx_lane_f64(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_vmulx_lane_f64:
+; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f64:
+; EXYNOS: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x double> %v, i32 1
+  %0 = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %extract)
+  ret double %0
+}
+
+define float @test_vmulx_lane_f32_1(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_vmulx_lane_f32_1:
+; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32_1:
+; EXYNOS: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; EXYNOS-NEXT: ret
+entry:
+  %extract = extractelement <2 x float> %v, i32 1
+  %0 = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %extract)
+  ret float %0
+}
+
 define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
 ; CHECK-LABEL: optimize_dup:
 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
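
Note on the intended transformation: the new EXYNOS check lines encode the rewrite the pass performs for the scalar indexed forms. As a minimal sketch (register names are illustrative, not taken from actual codegen), the FMLAv1i64_indexed case turns

    fmla  d0, d1, v2.d[1]

into the equivalent two-instruction sequence

    mov   d3, v2.d[1]        // lane copy emitted through CPYi64; reuseDUP() can share it
    fmadd d0, d1, d3, d0     // FMADDDrrr: d0 = d1 * d3 + d0

The operand swap in the second BuildMI arm exists because FMADD/FMSUB take the accumulator (SrcReg0) as their last source operand, whereas the indexed FMLA/FMLS forms tie it to the destination.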