diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c --- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c +++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c @@ -89,7 +89,7 @@ // COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> // UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]]) // CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <4 x half> [[FMLA]] float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_lane_f16(a, b, c, 3); @@ -105,7 +105,7 @@ // COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]]) // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <8 x half> [[FMLA]] float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { return vfmaq_lane_f16(a, b, c, 3); @@ -137,7 +137,7 @@ // COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <8 x half> [[FMLA]] float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_laneq_f16(a, b, c, 7); @@ -150,7 +150,7 @@ // COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3 // UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a) // CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <4 x half> [[FMA]] float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfma_n_f16(a, b, c); @@ -167,7 +167,7 @@ // COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 // UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a) // CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <8 x half> [[FMA]] float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmaq_n_f16(a, b, c); @@ -177,7 +177,7 @@ // COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3 // UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) // CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}} +// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret half [[FMA]] float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) { return vfmah_lane_f16(a, b, c, 3); @@ -187,7 +187,7 @@ // COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7 // UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) // CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}} +// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret half [[FMA]] float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) { return vfmah_laneq_f16(a, b, c, 7); @@ -195,7 +195,6 @@ // COMMON-LABEL: test_vfms_lane_f16 // COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b -// CHECK-ASM: fneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h // COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> // COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8> // COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> @@ -205,7 +204,7 @@ // COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> // UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]]) // CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <4 x half> [[FMA]] float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfms_lane_f16(a, b, c, 3); @@ -213,7 +212,6 @@ // COMMON-LABEL: test_vfmsq_lane_f16 // COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// CHECK-ASM: fneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h // COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> // COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8> // COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> @@ -223,7 +221,7 @@ // COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]]) // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <8 x half> [[FMLA]] float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { return vfmsq_lane_f16(a, b, c, 3); @@ -259,7 +257,7 @@ // COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <8 x half> [[FMLA]] float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_laneq_f16(a, b, c, 7); @@ -267,14 +265,13 @@ // COMMON-LABEL: test_vfms_n_f16 // COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b -// CHECK-ASM: fneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h // COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> undef, half %c, i32 0 // COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1 // COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2 // COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3 // UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a) // CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <4 x half> [[FMA]] float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfms_n_f16(a, b, c); @@ -282,7 +279,6 @@ // COMMON-LABEL: test_vfmsq_n_f16 // COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// CHECK-ASM: fneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h // COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> undef, half %c, i32 0 // COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1 // COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2 @@ -293,7 +289,7 @@ // COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 // UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a) // CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret <8 x half> [[FMA]] float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmsq_n_f16(a, b, c); @@ -311,7 +307,7 @@ // COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3 // UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) // CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}} +// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret half [[FMA]] float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) { return vfmsh_lane_f16(a, b, c, 3); @@ -329,7 +325,7 @@ // COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7 // UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) // CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmadd h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}}, h{{[0-9]+}} +// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] // COMMONIR: ret half [[FMA]] float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) { return vfmsh_laneq_f16(a, b, c, 7); diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -8052,6 +8052,34 @@ } multiclass SIMDFPIndexedTiedPatterns { + let Predicates = [HasNEON, HasFullFP16] in { + // Patterns for f16: DUPLANE, DUP scalar and vector_extract. + def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64duplane16 (v8f16 V128:$Rm), + VectorIndexH:$idx))), + (!cast(INST # "v8i16_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexH:$idx)>; + def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64dup (f16 FPR16Op:$Rm)))), + (!cast(INST # "v8i16_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR16Op:$Rm, hsub), (i64 0))>; + + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64duplane16 (v8f16 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # "v4i16_indexed") + V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64dup (f16 FPR16Op:$Rm)))), + (!cast(INST # "v4i16_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR16Op:$Rm, hsub), (i64 0))>; + + def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn), + (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx))), + (!cast(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn, + V128:$Rm, VectorIndexH:$idx)>; + } // Predicates = [HasNEON, HasFullFP16] + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -8086,15 +8114,11 @@ (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - // 2 variants for 32-bit scalar version: extract from .2s or from .4s + // Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; // 1 variant for 64-bit scalar version: extract from .1d or from .2d def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll --- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll +++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll @@ -14,8 +14,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h +; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0] ; CHECK-NEXT: ret entry: %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer @@ -29,8 +28,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h +; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer @@ -43,8 +41,7 @@ ; CHECK: .Lt_vfma_laneq_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: fmla v0.4h, v1.4h, v2.4h +; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0] ; CHECK-NEXT: ret entry: %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer @@ -57,8 +54,7 @@ ; CHECK: .Lt_vfmaq_laneq_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: fmla v0.8h, v1.8h, v2.8h +; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer @@ -72,8 +68,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h +; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit = insertelement <4 x half> undef, half %c, i32 0 @@ -88,8 +83,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h +; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit = insertelement <8 x half> undef, half %c, i32 0 @@ -104,7 +98,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmadd h0, h1, h2, h0 +; CHECK-NEXT: fmla h0, h1, v2.h[0] ; CHECK-NEXT: ret entry: %extract = extractelement <4 x half> %c, i32 0 @@ -117,7 +111,7 @@ ; CHECK: .Lt_vfmah_laneq_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: fmadd h0, h1, h2, h0 +; CHECK-NEXT: fmla h0, h1, v2.h[0] ; CHECK-NEXT: ret entry: %extract = extractelement <8 x half> %c, i32 0 @@ -131,9 +125,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fneg v1.4h, v1.4h -; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h +; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0] ; CHECK-NEXT: ret entry: %sub = fsub <4 x half> , %b @@ -148,9 +140,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fneg v1.8h, v1.8h -; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h +; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %sub = fsub <8 x half> , %b @@ -164,8 +154,7 @@ ; CHECK: .Lt_vfms_laneq_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0] ; CHECK-NEXT: ret entry: %sub = fsub <4 x half> , %b @@ -179,8 +168,7 @@ ; CHECK: .Lt_vfmsq_laneq_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h +; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %sub = fsub <8 x half> , %b @@ -195,9 +183,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-NEXT: fneg v1.4h, v1.4h -; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h +; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0] ; CHECK-NEXT: ret entry: %sub = fsub <4 x half> , %b @@ -213,9 +199,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-NEXT: fneg v1.8h, v1.8h -; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h +; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %sub = fsub <8 x half> , %b @@ -231,7 +215,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmsub h0, h1, h2, h0 +; CHECK-NEXT: fmls h0, h1, v2.h[0] ; CHECK-NEXT: ret entry: %0 = fsub half 0xH8000, %b @@ -245,7 +229,7 @@ ; CHECK: .Lt_vfmsh_laneq_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: fmsub h0, h1, h2, h0 +; CHECK-NEXT: fmls h0, h1, v2.h[0] ; CHECK-NEXT: ret entry: %0 = fsub half 0xH8000, %b @@ -438,8 +422,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov h2, v2.h[3] -; CHECK-NEXT: fmadd h0, h1, h2, h0 +; CHECK-NEXT: fmla h0, h1, v2.h[3] ; CHECK-NEXT: ret entry: %extract = extractelement <4 x half> %c, i32 3 @@ -452,8 +435,7 @@ ; CHECK: .Lt_vfmah_laneq7_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mov h2, v2.h[7] -; CHECK-NEXT: fmadd h0, h1, h2, h0 +; CHECK-NEXT: fmla h0, h1, v2.h[7] ; CHECK-NEXT: ret entry: %extract = extractelement <8 x half> %c, i32 7 @@ -467,8 +449,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov h2, v2.h[3] -; CHECK-NEXT: fmsub h0, h1, h2, h0 +; CHECK-NEXT: fmls h0, h1, v2.h[3] ; CHECK-NEXT: ret entry: %0 = fsub half 0xH8000, %b @@ -482,8 +463,7 @@ ; CHECK: .Lt_vfmsh_laneq7_f16$local: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mov h2, v2.h[7] -; CHECK-NEXT: fmsub h0, h1, h2, h0 +; CHECK-NEXT: fmls h0, h1, v2.h[7] ; CHECK-NEXT: ret entry: %0 = fsub half 0xH8000, %b @@ -498,8 +478,7 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: fadd v2.4h, v2.4h, v3.4h -; CHECK-NEXT: mov h2, v2.h[3] -; CHECK-NEXT: fmadd h0, h1, h2, h0 +; CHECK-NEXT: fmla h0, h1, v2.h[3] ; CHECK-NEXT: ret entry: %0 = fadd <4 x half> %c, %d