Index: clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c =================================================================== --- clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c +++ clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c @@ -103,7 +103,7 @@ // COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 // UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) // CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}] +// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} // COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> // COMMONIR: ret <1 x double> [[TMP7]] float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { @@ -122,7 +122,7 @@ // COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 // UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) // CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}] +// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} // COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> // COMMONIR: ret <1 x double> [[TMP7]] float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -5377,6 +5377,44 @@ 
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> { let Inst{23-22} = 0b01; // 64-bit size flag } + + let Predicates = [HasFullFP16] in { + def : Pat<(f16 (node (f16 FPR16:$Rn), + (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))), + (f16 FPR16:$Ra))), + (!cast<Instruction>(NAME # Hrrr) + FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub), FPR16:$Ra)>; + + def : Pat<(f16 (node (f16 (vector_extract (v8f16 V128:$Rn), (i64 0))), + (f16 FPR16:$Rm), + (f16 FPR16:$Ra))), + (!cast<Instruction>(NAME # Hrrr) + (EXTRACT_SUBREG V128:$Rn, hsub), FPR16:$Rm, FPR16:$Ra)>; + } + + def : Pat<(f32 (node (f32 FPR32:$Rn), + (f32 (vector_extract (v4f32 V128:$Rm), (i64 0))), + (f32 FPR32:$Ra))), + (!cast<Instruction>(NAME # Srrr) + FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub), FPR32:$Ra)>; + + def : Pat<(f32 (node (f32 (vector_extract (v4f32 V128:$Rn), (i64 0))), + (f32 FPR32:$Rm), + (f32 FPR32:$Ra))), + (!cast<Instruction>(NAME # Srrr) + (EXTRACT_SUBREG V128:$Rn, ssub), FPR32:$Rm, FPR32:$Ra)>; + + def : Pat<(f64 (node (f64 FPR64:$Rn), + (f64 (vector_extract (v2f64 V128:$Rm), (i64 0))), + (f64 FPR64:$Ra))), + (!cast<Instruction>(NAME # Drrr) + FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub), FPR64:$Ra)>; + + def : Pat<(f64 (node (f64 (vector_extract (v2f64 V128:$Rn), (i64 0))), + (f64 FPR64:$Rm), + (f64 FPR64:$Ra))), + (!cast<Instruction>(NAME # Drrr) + (EXTRACT_SUBREG V128:$Rn, dsub), FPR64:$Rm, FPR64:$Ra)>; } //--- Index: llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll =================================================================== --- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll +++ llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll @@ -7,16 +7,15 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) { ; CHECK-LABEL: complex_mul_v2f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: fmul h4, h0, v1.h[1] -; CHECK-NEXT: fnmul h2, h3, h2 -; CHECK-NEXT: fmla 
h4, h3, v1.h[0] -; CHECK-NEXT: fmla h2, h0, v1.h[0] -; CHECK-NEXT: mov v2.h[1], v4.h[0] -; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: mov h2, v0.h[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmul h3, h0, v1.h[1] +; CHECK-NEXT: fmul h4, h2, v1.h[1] +; CHECK-NEXT: fmadd h2, h1, h2, h3 +; CHECK-NEXT: fnmsub h0, h1, h0, h4 +; CHECK-NEXT: mov v0.h[1], v2.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> Index: llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll =================================================================== --- llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll +++ llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll @@ -80,11 +80,11 @@ ret <8 x half> %0 } -define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) { -; CHECK-LABEL: t_vfmah_lane_f16: +define dso_local half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmah_lane_f16_0: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmla h0, h1, v2.h[0] +; CHECK-NEXT: fmadd h0, h1, h2, h0 ; CHECK-NEXT: ret entry: %extract = extractelement <4 x half> %c, i32 0 @@ -92,10 +92,34 @@ ret half %0 } -define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) { -; CHECK-LABEL: t_vfmah_laneq_f16: +define dso_local half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmah_lane_f16_0_swap: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmla h0, h1, v2.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmadd h0, h2, h1, h0 +; CHECK-NEXT: ret +entry: + %extract = extractelement <4 x half> %c, i32 0 + %0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a) + ret half %0 +} + +define dso_local half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmah_lane_f16_3: +; 
CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla h0, h1, v2.h[3] +; CHECK-NEXT: ret +entry: + %extract = extractelement <4 x half> %c, i32 3 + %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a) + ret half %0 +} + +define dso_local half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmah_laneq_f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmadd h0, h1, h2, h0 ; CHECK-NEXT: ret entry: %extract = extractelement <8 x half> %c, i32 0 @@ -103,6 +127,28 @@ ret half %0 } +define dso_local half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmah_laneq_f16_0_swap: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmadd h0, h2, h1, h0 +; CHECK-NEXT: ret +entry: + %extract = extractelement <8 x half> %c, i32 0 + %0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a) + ret half %0 +} + +define dso_local half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmah_laneq_f16_7: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla h0, h1, v2.h[7] +; CHECK-NEXT: ret +entry: + %extract = extractelement <8 x half> %c, i32 7 + %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a) + ret half %0 +} + define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) { ; CHECK-LABEL: t_vfms_lane_f16: ; CHECK: // %bb.0: // %entry @@ -181,23 +227,49 @@ ret <8 x half> %0 } -define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) { -; CHECK-LABEL: t_vfmsh_lane_f16: +define dso_local half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmsh_lane_f16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmsub h0, h2, h1, h0 +; CHECK-NEXT: ret +entry: + %0 = fsub half 0xH8000, %b + %extract = extractelement <4 x half> %c, i32 0 + %1 = tail call half 
@llvm.fma.f16(half %0, half %extract, half %a) + ret half %1 +} + +define dso_local half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmsh_lane_f16_0_swap: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmls h0, h1, v2.h[0] +; CHECK-NEXT: fmsub h0, h2, h1, h0 ; CHECK-NEXT: ret entry: %0 = fsub half 0xH8000, %b %extract = extractelement <4 x half> %c, i32 0 + %1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a) + ret half %1 +} + +define dso_local half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmsh_lane_f16_3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmls h0, h1, v2.h[3] +; CHECK-NEXT: ret +entry: + %0 = fsub half 0xH8000, %b + %extract = extractelement <4 x half> %c, i32 3 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a) ret half %1 } -define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) { -; CHECK-LABEL: t_vfmsh_laneq_f16: +define dso_local half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmsh_laneq_f16_0: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmls h0, h1, v2.h[0] +; CHECK-NEXT: fmsub h0, h2, h1, h0 ; CHECK-NEXT: ret entry: %0 = fsub half 0xH8000, %b @@ -206,6 +278,30 @@ ret half %1 } +define dso_local half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmsh_laneq_f16_0_swap: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmsub h0, h2, h1, h0 +; CHECK-NEXT: ret +entry: + %0 = fsub half 0xH8000, %b + %extract = extractelement <8 x half> %c, i32 0 + %1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a) + ret half %1 +} + +define dso_local half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) { +; CHECK-LABEL: t_vfmsh_laneq_f16_7: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls h0, h1, v2.h[7] +; 
CHECK-NEXT: ret +entry: + %0 = fsub half 0xH8000, %b + %extract = extractelement <8 x half> %c, i32 7 + %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a) + ret half %1 +} + define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) { ; CHECK-LABEL: t_vmul_laneq_f16: ; CHECK: // %bb.0: // %entry Index: llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll =================================================================== --- llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll +++ llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll @@ -7,56 +7,132 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) -define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) { - ; CHECK-LABEL: test_fmla_ss4S +define float @test_fmla_ss4S_0(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmla_ss4S_0 + ; CHECK: fmadd s0, s1, s2, s0 + %tmp1 = extractelement <4 x float> %v, i32 0 + %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %tmp2 +} + +define float @test_fmla_ss4S_0_swap(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmla_ss4S_0_swap + ; CHECK: fmadd s0, s2, s1, s0 + %tmp1 = extractelement <4 x float> %v, i32 0 + %tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a) + ret float %tmp2 +} + +define float @test_fmla_ss4S_3(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmla_ss4S_3 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) ret float %tmp2 } -define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) { - ; CHECK-LABEL: test_fmla_ss4S_swap +define float @test_fmla_ss4S_3_swap(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmla_ss4S_3_swap ; CHECK: fmla {{s[0-9]+}}, 
{{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a) ret float %tmp2 } -define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) { - ; CHECK-LABEL: test_fmla_ss2S +define float @test_fmla_ss2S_0(float %a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmla_ss2S_0 + ; CHECK: fmadd s0, s1, s2, s0 + %tmp1 = extractelement <2 x float> %v, i32 0 + %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %tmp2 +} + +define float @test_fmla_ss2S_0_swap(float %a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmla_ss2S_0_swap + ; CHECK: fmadd s0, s2, s1, s0 + %tmp1 = extractelement <2 x float> %v, i32 0 + %tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a) + ret float %tmp2 +} + +define float @test_fmla_ss2S_1(float %a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmla_ss2S_1 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) ret float %tmp2 } -define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) { - ; CHECK-LABEL: test_fmla_ddD - ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}} +define double @test_fmla_ddD_0(double %a, double %b, <1 x double> %v) { + ; CHECK-LABEL: test_fmla_ddD_0 + ; CHECK: fmadd d0, d1, d2, d0 %tmp1 = extractelement <1 x double> %v, i32 0 %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a) ret double %tmp2 } -define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) { - ; CHECK-LABEL: test_fmla_dd2D +define double @test_fmla_ddD_0_swap(double %a, double %b, <1 x double> %v) { + ; CHECK-LABEL: test_fmla_ddD_0_swap + ; CHECK: fmadd d0, d2, d1, d0 + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a) + ret double %tmp2 +} + +define double 
@test_fmla_dd2D_0(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmla_dd2D_0 + ; CHECK: fmadd d0, d1, d2, d0 + %tmp1 = extractelement <2 x double> %v, i32 0 + %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a) + ret double %tmp2 +} + +define double @test_fmla_dd2D_0_swap(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmla_dd2D_0_swap + ; CHECK: fmadd d0, d2, d1, d0 + %tmp1 = extractelement <2 x double> %v, i32 0 + %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a) + ret double %tmp2 +} + +define double @test_fmla_dd2D_1(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmla_dd2D_1 ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a) ret double %tmp2 } -define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) { - ; CHECK-LABEL: test_fmla_dd2D_swap +define double @test_fmla_dd2D_1_swap(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmla_dd2D_1_swap ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a) ret double %tmp2 } -define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) { - ; CHECK-LABEL: test_fmls_ss4S +define float @test_fmls_ss4S_0(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmls_ss4S_0 + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <4 x float> %v, i64 0 + %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a) + ret float %0 +} + +define float @test_fmls_ss4S_0_swap(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmls_ss4S_0_swap + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <4 x float> %v, i64 0 + %0 = tail call float @llvm.fma.f32(float %extract, float %fneg, 
float %a) + ret float %0 +} + +define float @test_fmls_ss4S_3(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmls_ss4S_3 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = fsub float -0.0, %tmp1 @@ -64,8 +140,8 @@ ret float %tmp3 } -define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) { - ; CHECK-LABEL: test_fmls_ss4S_swap +define float @test_fmls_ss4S_3_swap(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmls_ss4S_3_swap ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = fsub float -0.0, %tmp1 @@ -74,8 +150,28 @@ } -define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) { - ; CHECK-LABEL: test_fmls_ss2S +define float @test_fmls_ss2S_0(float %a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmls_ss2S_0 + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <2 x float> %v, i64 0 + %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a) + ret float %0 +} + +define float @test_fmls_ss2S_0_swap(float %a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmls_ss2S_0_swap + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <2 x float> %v, i64 0 + %0 = tail call float @llvm.fma.f32(float %extract, float %fneg, float %a) + ret float %0 +} + +define float @test_fmls_ss2S_1(float %a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmls_ss2S_1 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = fsub float -0.0, %tmp1 @@ -83,17 +179,48 @@ ret float %tmp3 } -define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) { - ; CHECK-LABEL: test_fmls_ddD - ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}} - %tmp1 = extractelement <1 x double> %v, i32 0 - %tmp2 = fsub double -0.0, %tmp1 - %tmp3 = call double 
@llvm.fma.f64(double %tmp2, double %tmp1, double %a) - ret double %tmp3 +define double @test_fmls_ddD_0(double %a, double %b, <1 x double> %v) { + ; CHECK-LABEL: test_fmls_ddD_0 + ; CHECK: fmsub d0, d1, d2, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <1 x double> %v, i64 0 + %0 = tail call double @llvm.fma.f64(double %fneg, double %extract, double %a) + ret double %0 +} + +define double @test_fmls_ddD_0_swap(double %a, double %b, <1 x double> %v) { + ; CHECK-LABEL: test_fmls_ddD_0_swap + ; CHECK: fmsub d0, d2, d1, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <1 x double> %v, i64 0 + %0 = tail call double @llvm.fma.f64(double %extract, double %fneg, double %a) + ret double %0 +} + +define double @test_fmls_dd2D_0(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmls_dd2D_0 + ; CHECK: fmsub d0, d2, d1, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <2 x double> %v, i64 0 + %0 = tail call double @llvm.fma.f64(double %fneg, double %extract, double %a) + ret double %0 +} + +define double @test_fmls_dd2D_0_swap(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmls_dd2D_0_swap + ; CHECK: fmsub d0, d2, d1, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <2 x double> %v, i64 0 + %0 = tail call double @llvm.fma.f64(double %extract, double %fneg, double %a) + ret double %0 } -define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) { - ; CHECK-LABEL: test_fmls_dd2D +define double @test_fmls_dd2D_1(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmls_dd2D_1 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = fsub double -0.0, %tmp1 @@ -101,8 +228,8 @@ ret double %tmp3 } -define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) { - ; CHECK-LABEL: test_fmls_dd2D_swap +define double @test_fmls_dd2D_1_swap(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: 
test_fmls_dd2D_1_swap ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = fsub double -0.0, %tmp1 @@ -110,56 +237,132 @@ ret double %tmp3 } -define float @test_fmla_ss4S_strict(float %a, float %b, <4 x float> %v) #0 { - ; CHECK-LABEL: test_fmla_ss4S_strict +define float @test_fmla_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmla_ss4S_0_strict + ; CHECK: fmadd s0, s1, s2, s0 + %tmp1 = extractelement <4 x float> %v, i32 0 + %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %tmp2 +} + +define float @test_fmla_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmla_ss4S_0_swap_strict + ; CHECK: fmadd s0, s2, s1, s0 + %tmp1 = extractelement <4 x float> %v, i32 0 + %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %tmp2 +} + +define float @test_fmla_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmla_ss4S_3_strict ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %tmp2 } -define float @test_fmla_ss4S_swap_strict(float %a, float %b, <4 x float> %v) #0 { - ; CHECK-LABEL: test_fmla_ss4S_swap_strict +define float @test_fmla_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmla_ss4S_3_swap_strict ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %a, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %tmp2 } -define 
float @test_fmla_ss2S_strict(float %a, float %b, <2 x float> %v) #0 { - ; CHECK-LABEL: test_fmla_ss2S_strict +define float @test_fmla_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 { + ; CHECK-LABEL: test_fmla_ss2S_0_strict + ; CHECK: fmadd s0, s1, s2, s0 + %tmp1 = extractelement <2 x float> %v, i32 0 + %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %tmp2 +} + +define float @test_fmla_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 { + ; CHECK-LABEL: test_fmla_ss2S_0_swap_strict + ; CHECK: fmadd s0, s2, s1, s0 + %tmp1 = extractelement <2 x float> %v, i32 0 + %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %tmp2 +} + +define float @test_fmla_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 { + ; CHECK-LABEL: test_fmla_ss2S_1_strict ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %tmp2 } -define double @test_fmla_ddD_strict(double %a, double %b, <1 x double> %v) #0 { - ; CHECK-LABEL: test_fmla_ddD_strict - ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}} +define double @test_fmla_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 { + ; CHECK-LABEL: test_fmla_ddD_0_strict + ; CHECK: fmadd d0, d1, d2, d0 %tmp1 = extractelement <1 x double> %v, i32 0 %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %tmp2 } -define double @test_fmla_dd2D_strict(double %a, double %b, <2 x double> %v) #0 { - ; CHECK-LABEL: test_fmla_dd2D_strict +define double 
@test_fmla_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) #0 { + ; CHECK-LABEL: test_fmla_ddD_0_swap_strict + ; CHECK: fmadd d0, d2, d1, d0 + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret double %tmp2 +} + +define double @test_fmla_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmla_dd2D_0_strict + ; CHECK: fmadd d0, d1, d2, d0 + %tmp1 = extractelement <2 x double> %v, i32 0 + %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret double %tmp2 +} + +define double @test_fmla_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmla_dd2D_0_swap_strict + ; CHECK: fmadd d0, d2, d1, d0 + %tmp1 = extractelement <2 x double> %v, i32 0 + %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret double %tmp2 +} + +define double @test_fmla_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmla_dd2D_1_strict ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %tmp2 } -define double @test_fmla_dd2D_swap_strict(double %a, double %b, <2 x double> %v) #0 { - ; CHECK-LABEL: test_fmla_dd2D_swap_strict +define double @test_fmla_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmla_dd2D_1_swap_strict ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, 
double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %tmp2 } -define float @test_fmls_ss4S_strict(float %a, float %b, <4 x float> %v) #0 { - ; CHECK-LABEL: test_fmls_ss4S_strict +define float @test_fmls_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmls_ss4S_0_strict + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <4 x float> %v, i64 0 + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %fneg, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %0 +} + +define float @test_fmls_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmls_ss4S_0_swap_strict + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <4 x float> %v, i64 0 + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %extract, float %fneg, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %0 +} + +define float @test_fmls_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmls_ss4S_3_strict ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = fneg float %tmp1 @@ -167,8 +370,8 @@ ret float %tmp3 } -define float @test_fmls_ss4S_swap_strict(float %a, float %b, <4 x float> %v) #0 { - ; CHECK-LABEL: test_fmls_ss4S_swap_strict +define float @test_fmls_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 { + ; CHECK-LABEL: test_fmls_ss4S_3_swap_strict ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = fneg float %tmp1 @@ -176,8 +379,28 @@ ret float %tmp3 } -define float @test_fmls_ss2S_strict(float %a, float %b, <2 x float> %v) #0 { - ; CHECK-LABEL: test_fmls_ss2S_strict +define float @test_fmls_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 { + ; CHECK-LABEL: 
test_fmls_ss2S_0_strict + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <2 x float> %v, i64 0 + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %fneg, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %0 +} + +define float @test_fmls_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 { + ; CHECK-LABEL: test_fmls_ss2S_0_swap_strict + ; CHECK: fmsub s0, s2, s1, s0 +entry: + %fneg = fneg float %b + %extract = extractelement <2 x float> %v, i64 0 + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %extract, float %fneg, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %0 +} + +define float @test_fmls_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 { + ; CHECK-LABEL: test_fmls_ss2S_1_strict ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = fneg float %tmp1 @@ -185,17 +408,48 @@ ret float %tmp3 } -define double @test_fmls_ddD_strict(double %a, double %b, <1 x double> %v) #0 { - ; CHECK-LABEL: test_fmls_ddD_strict - ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}} - %tmp1 = extractelement <1 x double> %v, i32 0 - %tmp2 = fneg double %tmp1 - %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") - ret double %tmp3 +define double @test_fmls_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 { + ; CHECK-LABEL: test_fmls_ddD_0_strict + ; CHECK: fmsub d0, d2, d1, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <1 x double> %v, i64 0 + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %fneg, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret double %0 +} + +define double @test_fmls_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) 
#0 { + ; CHECK-LABEL: test_fmls_ddD_0_swap_strict + ; CHECK: fmsub d0, d2, d1, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <1 x double> %v, i64 0 + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %extract, double %fneg, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret double %0 +} + +define double @test_fmls_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmls_dd2D_0_strict + ; CHECK: fmsub d0, d2, d1, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <2 x double> %v, i64 0 + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %fneg, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret double %0 +} + +define double @test_fmls_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmls_dd2D_0_swap_strict + ; CHECK: fmsub d0, d2, d1, d0 +entry: + %fneg = fneg double %b + %extract = extractelement <2 x double> %v, i64 0 + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %extract, double %fneg, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret double %0 } -define double @test_fmls_dd2D_strict(double %a, double %b, <2 x double> %v) #0 { - ; CHECK-LABEL: test_fmls_dd2D_strict +define double @test_fmls_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmls_dd2D_1_strict ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = fneg double %tmp1 @@ -203,8 +457,8 @@ ret double %tmp3 } -define double @test_fmls_dd2D_swap_strict(double %a, double %b, <2 x double> %v) #0 { - ; CHECK-LABEL: test_fmls_dd2D_swap_strict +define double @test_fmls_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 { + ; CHECK-LABEL: test_fmls_dd2D_1_swap_strict ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = 
fneg double %tmp1