diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -951,6 +951,7 @@
   def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
             (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>;
 }
+
 let Predicates = [HasComplxNum, HasNEON] in {
   def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
             (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>;
@@ -975,14 +976,34 @@
             (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 3)>;
 }
 
+multiclass FCMLA_LANE_PATS<ValueType ty, RegisterClass Reg, dag RHSDup> {
+  def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+            (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 0)>;
+  def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+            (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 1)>;
+  def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+            (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 2)>;
+  def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+            (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 3)>;
+}
+
 let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
   defm : FCMLA_PATS<v4f16, V64>;
   defm : FCMLA_PATS<v8f16, V128>;
+
+  defm : FCMLA_LANE_PATS<v4f16, V64,
+                         (v4f16 (bitconvert (v2i32 (AArch64duplane32 (v4i32 (bitconvert (v8f16 V128_lo:$Rm))), VectorIndexS:$idx))))>;
+  defm : FCMLA_LANE_PATS<v8f16, V128,
+                         (v8f16 (bitconvert (v4i32 (AArch64duplane32 (v4i32 (bitconvert (v8f16 V128_lo:$Rm))), VectorIndexS:$idx))))>;
 }
 let Predicates = [HasComplxNum, HasNEON] in {
   defm : FCMLA_PATS<v2f32, V64>;
   defm : FCMLA_PATS<v4f32, V128>;
   defm : FCMLA_PATS<v2f64, V128>;
+
+  defm : FCMLA_LANE_PATS<v4f32, V128,
+                         (v4f32 (bitconvert (v2i64 (AArch64duplane64 (v2i64 (bitconvert (v4f32 V128:$Rm))), VectorIndexD:$idx))))>;
 }
 
 // v8.3a Pointer Authentication
diff --git a/llvm/test/CodeGen/AArch64/neon-vcmla.ll b/llvm/test/CodeGen/AArch64/neon-vcmla.ll
--- a/llvm/test/CodeGen/AArch64/neon-vcmla.ll
+++ b/llvm/test/CodeGen/AArch64/neon-vcmla.ll
@@ -9,6 +9,17 @@
   ret <4 x half> %res
 }
 
+define <4 x half> @test_16x4_lane_1(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+entry:
+; CHECK-LABEL: test_16x4_lane_1
+; CHECK: fcmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[1], #0
+;
+  %c.cast = bitcast <4 x half> %c to <2 x i32>
+  %c.dup = shufflevector <2 x i32> %c.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %c.res = bitcast <2 x i32> %c.dup to <4 x half>
+  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c.res)
+  ret <4 x half> %res
+}
 define <4 x half> @test_rot90_16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
 entry:
 ; CHECK-LABEL: test_rot90_16x4
@@ -19,6 +30,18 @@
   ret <4 x half> %res
 }
 
+define <4 x half> @test_rot90_16x4_lane_0(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+entry:
+; CHECK-LABEL: test_rot90_16x4_lane_0
+; CHECK: fcmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[0], #90
+;
+  %c.cast = bitcast <4 x half> %c to <2 x i32>
+  %c.dup = shufflevector <2 x i32> %c.cast, <2 x i32> undef, <2 x i32> <i32 0, i32 0>
+  %c.res = bitcast <2 x i32> %c.dup to <4 x half>
+  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c.res)
+  ret <4 x half> %res
+}
+
 define <4 x half> @test_rot180_16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
 entry:
 ; CHECK-LABEL: test_rot180_16x4
@@ -28,6 +51,18 @@
   ret <4 x half> %res
 }
 
+define <4 x half> @test_rot180_16x4_lane_0(<4 x half> %a, <4 x half> %b, <8 x half> %c) {
+entry:
+; CHECK-LABEL: test_rot180_16x4_lane_0
+; CHECK: fcmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[0], #180
+;
+  %c.cast = bitcast <8 x half> %c to <4 x i32>
+  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+  %c.res = bitcast <2 x i32> %c.dup to <4 x half>
+  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c.res)
+  ret <4 x half> %res
+}
+
 define <4 x half> @test_rot270_16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
 entry:
 ; CHECK-LABEL: test_rot270_16x4
@@ -82,6 +117,18 @@
   ret <8 x half> %res
 }
 
+define <8 x half> @test_16x8_lane_0(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+entry:
+; CHECK-LABEL: test_16x8_lane_0
+; CHECK: fcmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[0], #0
+;
+  %c.cast = bitcast <8 x half> %c to <4 x i32>
+  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
+  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
+  ret <8 x half> %res
+}
+
 define <8 x half> @test_rot90_16x8(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 entry:
 ; CHECK-LABEL: test_rot90_16x8
@@ -91,6 +138,18 @@
   ret <8 x half> %res
 }
 
+define <8 x half> @test_rot90_16x8_lane_1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+entry:
+; CHECK-LABEL: test_rot90_16x8_lane_1
+; CHECK: fcmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[1], #90
+;
+  %c.cast = bitcast <8 x half> %c to <4 x i32>
+  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
+  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
+  ret <8 x half> %res
+}
+
 define <8 x half> @test_rot180_16x8(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 entry:
 ; CHECK-LABEL: test_rot180_16x8
@@ -100,6 +159,18 @@
   ret <8 x half> %res
 }
 
+define <8 x half> @test_rot180_16x8_lane_1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+entry:
+; CHECK-LABEL: test_rot180_16x8_lane_1
+; CHECK: fcmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[1], #180
+;
+  %c.cast = bitcast <8 x half> %c to <4 x i32>
+  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
+  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
+  ret <8 x half> %res
+}
+
 define <8 x half> @test_rot270_16x8(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 entry:
 ; CHECK-LABEL: test_rot270_16x8
@@ -109,6 +180,18 @@
   ret <8 x half> %res
 }
 
+define <8 x half> @test_rot270_16x8_lane_0(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+entry:
+; CHECK-LABEL: test_rot270_16x8_lane_0
+; CHECK: fcmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[0], #270
+;
+  %c.cast = bitcast <8 x half> %c to <4 x i32>
+  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
+  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
+  ret <8 x half> %res
+}
+
 define <4 x float> @test_32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 entry:
 ; CHECK-LABEL: test_32x4
@@ -118,6 +201,18 @@
   ret <4 x float> %res
 }
 
+define <4 x float> @test_32x4_lane_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+entry:
+; CHECK-LABEL: test_32x4_lane_0
+; CHECK: fcmla v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0], #0
+;
+  %c.cast = bitcast <4 x float> %c to <2 x i64>
+  %c.dup = shufflevector <2 x i64> %c.cast, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %c.res = bitcast <2 x i64> %c.dup to <4 x float>
+  %res = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c.res)
+  ret <4 x float> %res
+}
+
 define <4 x float> @test_rot90_32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 entry:
 ; CHECK-LABEL: test_rot90_32x4