Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4437,6 +4437,19 @@
 }
 defm FSUB   : TwoOperandFPData<0b0011, "fsub", any_fsub>;
 
+// Match scalar FMUL instead of indexed FMUL when extracting lane 0.
+let Predicates = [HasFullFP16] in {
+def : Pat<(f16 (any_fmul (f16 FPR16:$Rn),
+                         (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
+          (FMULHrr FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
+}
+def : Pat<(f32 (any_fmul (f32 FPR32:$Rn),
+                         (f32 (vector_extract (v4f32 V128:$Rm), (i64 0))))),
+          (FMULSrr FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub))>;
+def : Pat<(f64 (any_fmul (f64 FPR64:$Rn),
+                         (f64 (vector_extract (v2f64 V128:$Rm), (i64 0))))),
+          (FMULDrr FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub))>;
+
 // Match reassociated forms of FNMUL.
 def : Pat<(fmul (fneg FPR16:$a), (f16 FPR16:$b)),
           (FNMULHrr FPR16:$a, FPR16:$b)>,
@@ -4467,6 +4480,50 @@
 defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
               TriOpFrag<(any_fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
 
+// Match FMADD instead of indexed FMLA when extracting lane 0.
+let Predicates = [HasFullFP16] in {
+def : Pat<(f16 (any_fma (f16 FPR16:$Rn),
+                        (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))),
+                        (f16 FPR16:$Ra))),
+          (FMADDHrrr FPR16:$Rn,
+                     (EXTRACT_SUBREG V128:$Rm, hsub),
+                     FPR16:$Ra)>;
+}
+def : Pat<(f32 (any_fma (f32 FPR32:$Rn),
+                        (f32 (vector_extract (v4f32 V128:$Rm), (i64 0))),
+                        (f32 FPR32:$Ra))),
+          (FMADDSrrr FPR32:$Rn,
+                     (EXTRACT_SUBREG V128:$Rm, ssub),
+                     FPR32:$Ra)>;
+def : Pat<(f64 (any_fma (f64 FPR64:$Rn),
+                        (f64 (vector_extract (v2f64 V128:$Rm), (i64 0))),
+                        (f64 FPR64:$Ra))),
+          (FMADDDrrr FPR64:$Rn,
+                     (EXTRACT_SUBREG V128:$Rm, dsub),
+                     FPR64:$Ra)>;
+
+// Match FMSUB instead of indexed FMLS when extracting lane 0.
+let Predicates = [HasFullFP16] in {
+def : Pat<(f16 (fma (f16 (fneg (f16 FPR16:$Rn))),
+                    (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))),
+                    (f16 FPR16:$Ra))),
+          (FMSUBHrrr FPR16:$Rn,
+                     (EXTRACT_SUBREG V128:$Rm, hsub),
+                     FPR16:$Ra)>;
+}
+def : Pat<(f32 (fma (f32 (fneg (f32 FPR32:$Rn))),
+                    (f32 (vector_extract (v4f32 V128:$Rm), (i64 0))),
+                    (f32 FPR32:$Ra))),
+          (FMSUBSrrr FPR32:$Rn,
+                     (EXTRACT_SUBREG V128:$Rm, ssub),
+                     FPR32:$Ra)>;
+def : Pat<(f64 (fma (f64 (fneg (f64 FPR64:$Rn))),
+                    (f64 (vector_extract (v2f64 V128:$Rm), (i64 0))),
+                    (f64 FPR64:$Ra))),
+          (FMSUBDrrr FPR64:$Rn,
+                     (EXTRACT_SUBREG V128:$Rm, dsub),
+                     FPR64:$Ra)>;
+
 // The following def pats catch the case where the LHS of an FMA is negated.
 // The TriOpFrag above catches the case where the middle operand is negated.
 
@@ -5210,6 +5267,23 @@
 defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
 defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
 defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>;
+// Match scalar FMULX instead of indexed FMULX when extracting lane 0.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : Pat<(f16 (int_aarch64_neon_fmulx
+                  (f16 FPR16:$Rn),
+                  (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
+          (FMULX16 FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
+}
+let Predicates = [HasNEON] in {
+def : Pat<(f32 (int_aarch64_neon_fmulx
+                  (f32 FPR32:$Rn),
+                  (f32 (vector_extract (v4f32 V128:$Rm), (i64 0))))),
+          (FMULX32 FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub))>;
+def : Pat<(f64 (int_aarch64_neon_fmulx
+                  (f64 FPR64:$Rn),
+                  (f64 (vector_extract (v2f64 V128:$Rm), (i64 0))))),
+          (FMULX64 FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub))>;
+}
 defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>;
 defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>;
 defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
@@ -6819,6 +6893,16 @@
                            VectorIndexS:$idx))),
             (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
                 (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+  // Match FMSUB instead of FMLS when extracting lane 0.
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (fneg V128:$Rm)), (i64 0)))),
+            (FMSUBSrrr FPR32:$Rd, FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub))>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (insert_subvector undef,
+                                                     (v2f32 (fneg V64:$Rm)),
+                                                     (i64 0))),
+                                         (i64 0)))),
+            (FMSUBSrrr FPR32:$Rd, FPR32:$Rn, (EXTRACT_SUBREG V64:$Rm, ssub))>;
 
   // 1 variant for 64-bit scalar version: extract from .1d or from .2d
   def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
@@ -6826,6 +6910,11 @@
                            VectorIndexS:$idx))),
             (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, V128:$Rm,
                 VectorIndexS:$idx)>;
+  // Match FMSUB instead of FMLS when extracting lane 0.
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 (fneg V128:$Rm)), (i64 0)))),
+            (FMSUBDrrr FPR64:$Rd, FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub))>;
+
 }
 
 defm : FMLSIndexedAfterNegPatterns<
@@ -6836,6 +6925,30 @@
 defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
 defm FMUL  : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>;
 
+// Match indexed FMUL instead of scalar FMUL if it might save a DUP.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : Pat<(f16 (any_fmul
+                  (f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
+                  (f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
+          (FMULv1i16_indexed (EXTRACT_SUBREG V128:$Rn, hsub),
+                             V128:$Rm,
+                             VectorIndexH:$idx)>;
+}
+let Predicates = [HasNEON] in {
+def : Pat<(f32 (any_fmul
+                  (f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
+                  (f32 (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx)))),
+          (FMULv1i32_indexed (EXTRACT_SUBREG V128:$Rn, ssub),
+                             V128:$Rm,
+                             VectorIndexS:$idx)>;
+def : Pat<(f64 (any_fmul
+                  (f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
+                  (f64 (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx)))),
+          (FMULv1i64_indexed (EXTRACT_SUBREG V128:$Rn, dsub),
+                             V128:$Rm,
+                             VectorIndexD:$idx)>;
+}
+
 def : Pat<(v2f32 (any_fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
           (FMULv2i32_indexed V64:$Rn,
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
Index: llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -17,7 +17,7 @@
 ; CHECK-LABEL: %for.body
 ; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 for.body:                        ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -59,7 +59,7 @@
 ; CHECK-LABEL: %for.body
 ; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 for.body:                        ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
Index: llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
+++ llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
@@ -10,7 +10,7 @@
 ; CHECK-LABEL: %for.body
 ; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 for.body:                        ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
@@ -52,7 +52,7 @@
 ; CHECK-LABEL: %for.body
 ; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 for.body:                        ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -558,6 +558,20 @@
 
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
 
+define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfma_laneq_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
+entry:
+  %lane = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
+  %0 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %lane, <1 x double> %b, <1 x double> %a)
+  ret <1 x double> %0
+}
+
+declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
+
 define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmaq_laneq_f64:
 ; CHECK:       // %bb.0: // %entry
@@ -582,6 +596,33 @@
   ret <2 x double> %0
 }
 
+define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfms_laneq_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
+entry:
+  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <1 x i32> <i32 1>
+  %0 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %lane, <1 x double> %b, <1 x double> %a)
+  ret <1 x double> %0
+}
+
+define <1 x double> @test_vfms_laneq_f64_alt(<1 x double> %a, <1 x double> %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfms_laneq_f64_alt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <1 x double> %a, i64 0
+  %1 = extractelement <1 x double> %b, i64 0
+  %2 = fneg double %1
+  %extract = extractelement <2 x double> %v, i64 1
+  %3 = tail call double @llvm.fma.f64(double %2, double %extract, double %0)
+  %4 = bitcast double %3 to <1 x double>
+  ret <1 x double> %4
+}
+
 define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsq_laneq_f64:
 ; CHECK:       // %bb.0: // %entry
@@ -594,33 +635,31 @@
   ret <2 x double> %0
 }
 
-define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmas_laneq_f32:
+define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmas_lane_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmla s0, s1, v2.s[3]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmla s0, s1, v2.s[1]
 ; CHECK-NEXT:    ret
 entry:
-  %extract = extractelement <4 x float> %v, i32 3
+  %extract = extractelement <2 x float> %v, i32 1
   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
   ret float %0
 }
 
 declare float @llvm.fma.f32(float, float, float)
 
-define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
-; CHECK-LABEL: test_vfmsd_lane_f64:
+define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
+; CHECK-LABEL: test_vfmas_laneq_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    fmla s0, s1, v2.s[3]
 ; CHECK-NEXT:    ret
 entry:
-  %extract.rhs = extractelement <1 x double> %v, i32 0
-  %extract = fsub double -0.000000e+00, %extract.rhs
-  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
-  ret double %0
+  %extract = extractelement <4 x float> %v, i32 3
+  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+  ret float %0
 }
 
-declare double @llvm.fma.f64(double, double, double)
-
 define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmss_lane_f32:
 ; CHECK:       // %bb.0: // %entry
@@ -634,6 +673,19 @@
   ret float %0
 }
 
+define float @test_vfmss_lane_f32_alt(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmss_lane_f32_alt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmls s0, s1, v2.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <2 x float> %v, i64 1
+  %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a)
+  ret float %0
+}
+
 define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmss_laneq_f32:
 ; CHECK:       // %bb.0: // %entry
@@ -646,6 +698,31 @@
   ret float %0
 }
 
+define float @test_vfmss_laneq_f32_alt(float %a, float %b, <4 x float> %v) {
+; CHECK-LABEL: test_vfmss_laneq_f32_alt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <4 x float> %v, i64 3
+  %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a)
+  ret float %0
+}
+
+define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmad_laneq_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <2 x double> %v, i32 1
+  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
+  ret double %0
+}
+
+declare double @llvm.fma.f64(double, double, double)
+
 define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsd_laneq_f64:
 ; CHECK:       // %bb.0: // %entry
@@ -658,55 +735,148 @@
   ret double %0
 }
 
-define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
-; CHECK-LABEL: test_vfmsd_lane_f64_0:
+define double @test_vfmsd_laneq_f64_alt(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmsd_laneq_f64_alt:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
 ; CHECK-NEXT:    ret
 entry:
-  %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
-  %tmp1 = extractelement <1 x double> %tmp0, i32 0
-  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+  %fneg = fneg double %b
+  %extract = extractelement <2 x double> %v, i64 1
+  %0 = tail call double @llvm.fma.f64(double %fneg, double %extract, double %a)
   ret double %0
 }
 
+define float @test_vfmas_lane_f32_0(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmas_lane_f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <2 x float> %v, i32 0
+  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+  ret float %0
+}
+
+define float @test_vfmas_laneq_f32_0(float %a, float %b, <4 x float> %v) {
+; CHECK-LABEL: test_vfmas_laneq_f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <4 x float> %v, i32 0
+  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+  ret float %0
+}
+
 define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmss_lane_f32_0:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    fmls s0, s1, v2.s[1]
+; CHECK-NEXT:    fmsub s0, s0, s1, s2
 ; CHECK-NEXT:    ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
-  %tmp1 = extractelement <2 x float> %tmp0, i32 1
+  %tmp1 = extractelement <2 x float> %tmp0, i32 0
   %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %0
 }
 
+define float @test_vfmss_lane_f32_0_alt(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmss_lane_f32_0_alt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmsub s0, s1, s2, s0
+; CHECK-NEXT:    ret
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <2 x float> %v, i64 0
+  %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a)
+  ret float %0
+}
+
 define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmss_laneq_f32_0:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    fmsub s0, s0, s1, s2
 ; CHECK-NEXT:    ret
 entry:
   %tmp0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
-  %tmp1 = extractelement <4 x float> %tmp0, i32 3
+  %tmp1 = extractelement <4 x float> %tmp0, i32 0
   %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %0
 }
 
+define float @test_vfmss_laneq_f32_0_alt(float %a, float %b, <4 x float> %v) {
+; CHECK-LABEL: test_vfmss_laneq_f32_0_alt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub s0, s1, s2, s0
+; CHECK-NEXT:    ret
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <4 x float> %v, i64 0
+  %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a)
+  ret float %0
+}
+
+define double @test_vfmad_lane_f64_0(double %a, double %b, <1 x double> %v) {
+; CHECK-LABEL: test_vfmad_lane_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <1 x double> %v, i32 0
+  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
+  ret double %0
+}
+
+define double @test_vfmad_laneq_f64_0(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmad_laneq_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <2 x double> %v, i32 0
+  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
+  ret double %0
+}
+
+define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
+; CHECK-LABEL: test_vfmsd_lane_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
+entry:
+  %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
+  %tmp1 = extractelement <1 x double> %tmp0, i32 0
+  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+  ret double %0
+}
+
 define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsd_laneq_f64_0:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    fmsub d0, d0, d1, d2
 ; CHECK-NEXT:    ret
 entry:
   %tmp0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
-  %tmp1 = extractelement <2 x double> %tmp0, i32 1
+  %tmp1 = extractelement <2 x double> %tmp0, i32 0
   %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
   ret double %0
 }
 
+define double @test_vfmsd_laneq_f64_0_alt(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmsd_laneq_f64_0_alt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <2 x double> %v, i64 0
+  %0 = tail call double @llvm.fma.f64(double %fneg, double %extract, double %a)
+  ret double %0
+}
+
 define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -2591,6 +2761,27 @@
   ret <4 x float> %0
 }
 
+define <1 x double> @test_vfma_lane_f64_0(<1 x double> %a, <1 x double> %b, <1 x double> %v) {
+; CHECK-LABEL: test_vfma_lane_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %v, <1 x double> %b, <1 x double> %a)
+  ret <1 x double> %0
+}
+
+define <1 x double> @test_vfma_laneq_f64_0(<1 x double> %a, <1 x double> %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfma_laneq_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
+entry:
+  %lane = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> zeroinitializer
+  %0 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %lane, <1 x double> %b, <1 x double> %a)
+  ret <1 x double> %0
+}
+
 define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmaq_laneq_f64_0:
 ; CHECK:       // %bb.0: // %entry
@@ -2602,6 +2793,44 @@
   ret <2 x double> %0
 }
 
+define <1 x double> @test_vfms_lane_f64_0(<1 x double> %a, <1 x double> %b, <1 x double> %v) {
+; CHECK-LABEL: test_vfms_lane_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
+entry:
+  %sub = fsub <1 x double> <double -0.000000e+00>, %v
+  %0 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %sub, <1 x double> %b, <1 x double> %a)
+  ret <1 x double> %0
+}
+
+define <1 x double> @test_vfms_laneq_f64_0(<1 x double> %a, <1 x double> %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfms_laneq_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d0, d1, d2
+; CHECK-NEXT:    ret
+entry:
+  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <1 x i32> zeroinitializer
+  %0 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %lane, <1 x double> %b, <1 x double> %a)
+  ret <1 x double> %0
+}
+
+define <1 x double> @test_vfms_laneq_f64_0_alt(<1 x double> %a, <1 x double> %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfms_laneq_f64_0_alt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <1 x double> %a, i64 0
+  %1 = extractelement <1 x double> %b, i64 0
+  %2 = fneg double %1
+  %extract = extractelement <2 x double> %v, i64 0
+  %3 = tail call double @llvm.fma.f64(double %2, double %extract, double %0)
+  %4 = bitcast double %3 to <1 x double>
+  ret <1 x double> %4
+}
+
 define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsq_laneq_f64_0:
 ; CHECK:       // %bb.0: // %entry
@@ -3560,7 +3789,7 @@
 define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq_f64_0:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmul d0, d0, v1.d[0]
+; CHECK-NEXT:    fmul d0, d0, d1
 ; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
Index: llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
@@ -126,7 +126,7 @@
 define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
 ; CHECK-LABEL: test_fmulx_laneq_f64_0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmulx d0, d0, v1.d[0]
+; CHECK-NEXT:    fmulx d0, d0, d1
 ; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 0
   %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
Index: llvm/test/CodeGen/AArch64/arm64-vmul.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1166,8 +1166,40 @@
   ret <2 x double> %tmp4
 }
 
-define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
-; CHECK-LABEL: fmul_lane_s:
+define float @fmul_lane0_s(float %A, <2 x float> %vec) nounwind {
+; CHECK-LABEL: fmul_lane0_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %B = extractelement <2 x float> %vec, i32 0
+  %res = fmul float %A, %B
+  ret float %res
+}
+
+define float @fmul_lane1_s(float %A, <2 x float> %vec) nounwind {
+; CHECK-LABEL: fmul_lane1_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmul.s s0, s0, v1[1]
+; CHECK-NEXT:    ret
+  %B = extractelement <2 x float> %vec, i32 1
+  %res = fmul float %A, %B
+  ret float %res
+}
+
+define float @fmul_laneq0_s(float %A, <4 x float> %vec) nounwind {
+; CHECK-LABEL: fmul_laneq0_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %B = extractelement <4 x float> %vec, i32 0
+  %res = fmul float %A, %B
+  ret float %res
+}
+
+define float @fmul_laneq3_s(float %A, <4 x float> %vec) nounwind {
+; CHECK-LABEL: fmul_laneq3_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmul.s s0, s0, v1[3]
 ; CHECK-NEXT:    ret
@@ -1176,8 +1208,28 @@
   ret float %res
 }
 
-define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
-; CHECK-LABEL: fmul_lane_d:
+define double @fmul_lane0_d(double %A, <1 x double> %vec) nounwind {
+; CHECK-LABEL: fmul_lane0_d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    ret
+  %B = extractelement <1 x double> %vec, i32 0
+  %res = fmul double %A, %B
+  ret double %res
+}
+
+define double @fmul_laneq0_d(double %A, <2 x double> %vec) nounwind {
+; CHECK-LABEL: fmul_laneq0_d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    ret
+  %B = extractelement <2 x double> %vec, i32 0
+  %res = fmul double %A, %B
+  ret double %res
+}
+
+define double @fmul_laneq1_d(double %A, <2 x double> %vec) nounwind {
+; CHECK-LABEL: fmul_laneq1_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmul.d d0, d0, v1[1]
 ; CHECK-NEXT:    ret
@@ -1745,8 +1797,40 @@
   ret double %fmulx.i
 }
 
-define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
-; CHECK-LABEL: fmulxs_lane:
+define float @fmulxs_lane0(float %a, <2 x float> %vec) nounwind {
+; CHECK-LABEL: fmulxs_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmulx s0, s0, s1
+; CHECK-NEXT:    ret
+  %b = extractelement <2 x float> %vec, i32 0
+  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
+  ret float %fmulx.i
+}
+
+define float @fmulxs_lane1(float %a, <2 x float> %vec) nounwind {
+; CHECK-LABEL: fmulxs_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmulx.s s0, s0, v1[1]
+; CHECK-NEXT:    ret
+  %b = extractelement <2 x float> %vec, i32 1
+  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
+  ret float %fmulx.i
+}
+
+define float @fmulxs_laneq0(float %a, <4 x float> %vec) nounwind {
+; CHECK-LABEL: fmulxs_laneq0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmulx s0, s0, s1
+; CHECK-NEXT:    ret
+  %b = extractelement <4 x float> %vec, i32 0
+  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
+  ret float %fmulx.i
+}
+
+define float @fmulxs_laneq3(float %a, <4 x float> %vec) nounwind {
+; CHECK-LABEL: fmulxs_laneq3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmulx.s s0, s0, v1[3]
 ; CHECK-NEXT:    ret
@@ -1755,8 +1839,28 @@
   ret float %fmulx.i
 }
 
-define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
-; CHECK-LABEL: fmulxd_lane:
+define double @fmulxd_lane0(double %a, <1 x double> %vec) nounwind {
+; CHECK-LABEL: fmulxd_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmulx d0, d0, d1
+; CHECK-NEXT:    ret
+  %b = extractelement <1 x double> %vec, i32 0
+  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
+  ret double %fmulx.i
+}
+
+define double @fmulxd_laneq0(double %a, <2 x double> %vec) nounwind {
+; CHECK-LABEL: fmulxd_laneq0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmulx d0, d0, d1
+; CHECK-NEXT:    ret
+  %b = extractelement <2 x double> %vec, i32 0
+  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
+  ret double %fmulx.i
+}
+
+define double @fmulxd_laneq1(double %a, <2 x double> %vec) nounwind {
+; CHECK-LABEL: fmulxd_laneq1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmulx.d d0, d0, v1[1]
 ; CHECK-NEXT:    ret
Index: llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
===================================================================
--- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
+++ llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
@@ -9,14 +9,14 @@
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    fmul h4, h2, v0.h[0]
-; CHECK-NEXT:    fnmul h2, h3, h2
-; CHECK-NEXT:    fmla h4, h3, v1.h[0]
-; CHECK-NEXT:    fmla h2, h0, v1.h[0]
-; CHECK-NEXT:    mov v2.h[1], v4.h[0]
-; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    mov h2, v0.h[1]
+; CHECK-NEXT:    mov h3, v1.h[1]
+; CHECK-NEXT:    fmul h4, h0, v1.h[1]
+; CHECK-NEXT:    fnmul h3, h2, h3
+; CHECK-NEXT:    fmadd h2, h2, h1, h4
+; CHECK-NEXT:    fmadd h0, h1, h0, h3
+; CHECK-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
   %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> zeroinitializer
Index: llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -84,7 +84,7 @@
 ; CHECK-LABEL: t_vfmah_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    fmla h0, h1, v2.h[0]
+; CHECK-NEXT:    fmadd h0, h1, h2, h0
 ; CHECK-NEXT:    ret
 entry:
   %extract = extractelement <4 x half> %c, i32 0
@@ -95,7 +95,7 @@
 define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmla h0, h1, v2.h[0]
+; CHECK-NEXT:    fmadd h0, h1, h2, h0
 ; CHECK-NEXT:    ret
 entry:
   %extract = extractelement <8 x half> %c, i32 0
@@ -185,7 +185,7 @@
 ; CHECK-LABEL: t_vfmsh_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    fmls h0, h1, v2.h[0]
+; CHECK-NEXT:    fmsub h0, h1, h2, h0
 ; CHECK-NEXT:    ret
 entry:
   %0 = fsub half 0xH8000, %b
@@ -197,7 +197,7 @@
 define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmls h0, h1, v2.h[0]
+; CHECK-NEXT:    fmsub h0, h1, h2, h0
 ; CHECK-NEXT:    ret
 entry:
   %0 = fsub half 0xH8000, %b
@@ -232,7 +232,7 @@
 ; CHECK-LABEL: t_vmulh_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    fmul h0, h0, v1.h[0]
+; CHECK-NEXT:    fmul h0, h0, h1
 ; CHECK-NEXT:    ret
 entry:
   %0 = extractelement <4 x half> %c, i32 0
@@ -243,7 +243,7 @@
 define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vmulh_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmul h0, h0, v1.h[0]
+; CHECK-NEXT:    fmul h0, h0, h1
 ; CHECK-NEXT:    ret
 entry:
   %0 = extractelement <8 x half> %c, i32 0
@@ -251,8 +251,8 @@
   ret half %1
 }
 
-define dso_local half @t_vmulx_f16(half %a, half %b) {
-; CHECK-LABEL: t_vmulx_f16:
+define dso_local half @t_vmulxh_f16(half %a, half %b) {
+; CHECK-LABEL: t_vmulxh_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx h0, h0, h1
 ; CHECK-NEXT:    ret
@@ -265,10 +265,21 @@
 ; CHECK-LABEL: t_vmulxh_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    fmulx h0, h0, v1.h[3]
+; CHECK-NEXT:    fmulx h0, h0, h1
 ; CHECK-NEXT:    ret
 entry:
-  %extract = extractelement <4 x half> %b, i32 3
+  %extract = extractelement <4 x half> %b, i32 0
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_laneq_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx h0, h0, h1
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <8 x half> %b, i32 0
   %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
   ret half %fmulx.i
 }
@@ -319,17 +330,6 @@
   ret <8 x half> %vmulx2.i
 }
 
-define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
-; CHECK-LABEL: t_vmulxh_laneq_f16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmulx h0, h0, v1.h[7]
-; CHECK-NEXT:    ret
-entry:
-  %extract = extractelement <8 x half> %b, i32 7
-  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
-  ret half %fmulx.i
-}
-
 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulx_n_f16:
 ; CHECK:       // %bb.0: // %entry
@@ -406,6 +406,52 @@
   ret half %1
 }
 
+define dso_local half @t_vmulh_lane3_f16(half %a, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_lane3_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmul h0, h0, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <4 x half> %c, i32 3
+  %1 = fmul half %0, %a
+  ret half %1
+}
+
+define dso_local half @t_vmulh_laneq7_f16(half %a, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_laneq7_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul h0, h0, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <8 x half> %c, i32 7
+  %1 = fmul half %0, %a
+  ret half %1
+}
+
+define dso_local half @t_vmulxh_lane3_f16(half %a, <4 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_lane3_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmulx h0, h0, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <4 x half> %b, i32 3
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_laneq7_f16(half %a, <8 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_laneq7_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx h0, h0, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <8 x half> %b, i32 7
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
 define dso_local half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: t_fadd_vfmah_f16:
 ; CHECK:       // %bb.0: // %entry
Index: llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
===================================================================
--- llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
+++ llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
@@ -70,15 +70,15 @@
 ; CHECK-NEXT:    fmul s4, s0, v0.s[1]
 ; CHECK-NEXT:    fmul s4, s4, v0.s[2]
 ; CHECK-NEXT:    fmul s0, s4, v0.s[3]
-; CHECK-NEXT:    fmul s0, s0, v1.s[0]
+; CHECK-NEXT:    fmul s0, s0, s1
 ; CHECK-NEXT:    fmul s0, s0, v1.s[1]
 ; CHECK-NEXT:    fmul s0, s0, v1.s[2]
 ; CHECK-NEXT:    fmul s0, s0, v1.s[3]
-; CHECK-NEXT:    fmul s0, s0, v2.s[0]
+; CHECK-NEXT:    fmul s0, s0, s2
 ; CHECK-NEXT:    fmul s0, s0, v2.s[1]
 ; CHECK-NEXT:    fmul s0, s0, v2.s[2]
 ; CHECK-NEXT:    fmul s0, s0, v2.s[3]
-; CHECK-NEXT:    fmul s0, s0, v3.s[0]
+; CHECK-NEXT:    fmul s0, s0, s3
 ; CHECK-NEXT:    fmul s0, s0, v3.s[1]
 ; CHECK-NEXT:    fmul s0, s0, v3.s[2]
 ; CHECK-NEXT:    fmul s0, s0, v3.s[3]