diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
@@ -105,7 +105,7 @@
 // COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
 // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
 // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
 // COMMONIR: ret <8 x half> [[FMLA]]
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
   return vfmaq_lane_f16(a, b, c, 3);
@@ -213,7 +213,6 @@
 
 // COMMON-LABEL: test_vfmsq_lane_f16
 // COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// CHECK-ASM: fneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 // COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
 // COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8>
 // COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
@@ -223,7 +222,7 @@
 // COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
 // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
 // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
 // COMMONIR: ret <8 x half> [[FMLA]]
 float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
   return vfmsq_lane_f16(a, b, c, 3);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8052,6 +8052,15 @@
 }
 
 multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  // 1 variant for the .8h version: DUPLANE from 128-bit.
+  def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+                           (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx)))),
+            (!cast<Instruction>(INST # "v8i16_indexed")
+                V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
+  } // Predicates = [HasNEON, HasFullFP16]
+
   // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
                           (AArch64duplane32 (v4f32 V128:$Rm),
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -29,8 +29,7 @@
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
@@ -57,8 +56,7 @@
 ; CHECK: .Lt_vfmaq_laneq_f16$local:
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
@@ -148,9 +146,7 @@
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: fneg v1.8h, v1.8h
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -179,8 +175,7 @@
 ; CHECK: .Lt_vfmsq_laneq_f16$local:
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
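
For reviewers who want to reproduce the codegen change outside the lit tests, the following minimal standalone C sketch exercises the same intrinsics as the updated clang test. It is not part of the patch; the file name and exact compile command are illustrative, but the intrinsics, the lane index, and the expected instructions match the CHECK lines above. With the new .8h indexed pattern, the splatted lane operand should be selected as "fmla/fmls vD.8h, vN.8h, vM.h[3]" instead of a separate dup (and fneg) followed by the vector-by-vector form.

// fmla_lane_demo.c -- illustrative only; build with something like:
//   clang --target=aarch64-linux-gnu -march=armv8.2-a+fp16 -O2 -S fmla_lane_demo.c
#include <arm_neon.h>

// Per lane: d + n * c[3]. With this patch it should lower to a single
// "fmla v0.8h, v1.8h, v2.h[3]".
float16x8_t fma_by_lane(float16x8_t d, float16x8_t n, float16x4_t c) {
  return vfmaq_lane_f16(d, n, c, 3);
}

// Per lane: d - n * c[3]. The negation folds into
// "fmls v0.8h, v1.8h, v2.h[3]" rather than emitting fneg + dup + fmla.
float16x8_t fms_by_lane(float16x8_t d, float16x8_t n, float16x4_t c) {
  return vfmsq_lane_f16(d, n, c, 3);
}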